record my colorful life!

给micolog写的一个插件,用于自动保存远程图片。

实现了:

  1. 自动解析文章中图片,并异步存储到gae的storage中
  2. 可以自定义ignore_domain_list,用于列出比较稳定的图床网站,来自这些网站的图片不会被保存到本地

还不太完善,经常会提示保存出错,而且cpu占用比较高。会慢慢改进的。

goto micolog plugin download page

贴上代码,主要用了异步request,喜欢的同学随意使用,代码比较烂就是了

 

#!/usr/bin/env python
#coding=utf-8

'''
Created on Sep 19, 2010
Automatically save remote images to local gae storage
@author: wade (wade.beyond@gmail.com)
'''
import re
import uuid
import logging

from google.appengine.api import urlfetch

from micolog_plugin import Plugin, OptionSet
from model import Media

def _finish_fetch_remote_image(entry):
    """Record that one pending image of ``entry`` has been dealt with.

    Decrements the ``srip_remote_images`` counter kept on ``entry``. When the
    counter reaches exactly zero the entry is saved with ``entry.put()`` and
    a completion message is logged; otherwise nothing else happens.
    """
    still_pending = entry.srip_remote_images - 1
    entry.srip_remote_images = still_pending
    if still_pending != 0:
        return
    entry.put()
    logging.info('Finish fetch all remote images for entry(title=%s)' % entry.title)

def _handle_failed_fetch_remote_image(entry, dummy_image_url, remote_image_url, error):
    """Undo the placeholder substitution for an image that could not be fetched.

    Logs the failure (with ``exc_info=True``), replaces every occurrence of
    ``dummy_image_url`` in ``entry.content`` with the original
    ``remote_image_url``, and then counts the image as handled via
    ``_finish_fetch_remote_image``.
    """
    message = "Failed to fetch remote image(%s) for entry(%s).\nerror: %s" % (
        remote_image_url, entry.title, str(error))
    logging.info(message, exc_info=True)
    restored_content = entry.content.replace(dummy_image_url, remote_image_url)
    entry.content = restored_content
    _finish_fetch_remote_image(entry)

def _handle_succeed_fetch_remote_image(response, entry, remote_image_url, local_image_name):
    """Store a successfully fetched image and point the entry at the stored copy.

    A ``Media`` record is created from the response (name
    ``local_image_name``, type taken from the ``Content-Type`` header, body
    as the bits) and saved. Every occurrence of ``local_image_name`` in
    ``entry.content`` is then replaced by ``/media/<media key>``, and the
    image is counted as handled via ``_finish_fetch_remote_image``.
    """
    logging.info('Succeed get remote image in _handle_succeed_fetch_remote_image remote_image_url=%s, local_image_name=%s' %
                 (remote_image_url, local_image_name))
    # Persist the downloaded bytes as a Media record.
    stored_image = Media(name=local_image_name,
                         mtype=response.headers['Content-Type'],
                         bits=response.content)
    stored_image.put()
    # Swap the placeholder for the URL of the stored copy.
    entry.content = entry.content.replace(
        local_image_name, '/media/%s' % str(stored_image.key()))
    _finish_fetch_remote_image(entry)

def _handle_async_fetch_remote_image_result(rpc, entry, remote_image_url, local_image_name):
    """Dispatch the outcome of one asynchronous image fetch.

    Parameters:
        rpc: the fetch RPC whose result is ready to be read.
        entry: the post being processed.
        remote_image_url: the original image URL that was requested.
        local_image_name: the placeholder currently standing in for that URL
            in ``entry.content``.

    A 200 response goes to ``_handle_succeed_fetch_remote_image``. Any other
    status code, or an exception raised while reading the result, goes to
    ``_handle_failed_fetch_remote_image`` so the original URL is restored and
    the image is still counted as handled.
    """
    import sys  # local import: used only to read the caught exception below

    logging.info('Response remote image in _handle_async_fetch_remote_image_result remote_image_url=%s, local_image_name=%s' %
                 (remote_image_url, local_image_name))
    try:
        response = rpc.get_result()
    except Exception:
        # Reading the result is where a failed fetch surfaces as an exception.
        # Treat it like any other failed download instead of letting it escape
        # the callback, which would leave the placeholder in the content and
        # the pending counter stuck above zero.
        # sys.exc_info() is used instead of binding the exception in the
        # except clause so the syntax is valid on every Python version.
        _handle_failed_fetch_remote_image(entry, local_image_name, remote_image_url,
                                          sys.exc_info()[1])
        return

    if response.status_code == 200:
        _handle_succeed_fetch_remote_image(response, entry, remote_image_url, local_image_name)
    else:
        _handle_failed_fetch_remote_image(entry, local_image_name, remote_image_url,
            Exception('code=%s content=%s' % (response.status_code, response.content)))

def _create_async_rpc_callback(rpc, entry, remote_image_url, will_save_as_local_image_name):
    """Return a zero-argument callback bound to one specific image fetch.

    The returned callable forwards the four arguments given here to
    ``_handle_async_fetch_remote_image_result`` and returns its result.
    Building it in a separate function gives each callback its own copy of
    the arguments.
    """
    def _on_fetch_complete():
        return _handle_async_fetch_remote_image_result(
            rpc, entry, remote_image_url, will_save_as_local_image_name)

    return _on_fetch_complete

def _async_fetch_remote_images(entry, remote_image_urls):
    """Start asynchronous downloads for every remote image of ``entry`` and wait for them.

    Parameters:
        entry: the post being processed; its ``content`` is rewritten in
            place and a ``srip_remote_images`` counter is set on it.
        remote_image_urls: the remote image URLs found in ``entry.content``.

    Each URL in ``entry.content`` is first replaced by a placeholder name
    ``<prefix>-<index>``, where the prefix is ``entry.slug`` or a fresh UUID
    when the slug is empty. One fetch RPC per URL is then started with a
    callback that either stores the image or restores the original URL. A URL
    whose RPC cannot even be started is handed to the failure handler
    immediately. The function returns after waiting on all started RPCs.
    """
    import sys  # local import: used only to read the caught exception below

    if not remote_image_urls:
        logging.info('No remote images in enrty(title=%s)' % entry.title)
        return

    logging.info('Remote images(cnt=%s) in entry(title=%s)' %
                 (len(remote_image_urls), entry.title))
    logging.info('Remote image urls: \n%s' % '\n'.join(remote_image_urls))
    entry.srip_remote_images = len(remote_image_urls)
    image_name_prefix = entry.slug or uuid.uuid4()

    # Zero-pad the index so every placeholder has the same length. Without
    # padding "<prefix>-1" is a prefix of "<prefix>-10", and the later
    # str.replace() of the first placeholder would corrupt the second. With up
    # to 10 images the names are the same as the unpadded ones.
    index_width = len(str(len(remote_image_urls) - 1))

    # Replace longer URLs first so a URL that is a substring of another one
    # (e.g. the same image with and without a query string) cannot clobber it.
    ordered_urls = sorted(remote_image_urls, key=len, reverse=True)

    rpcs = []
    for index, remote_image_url in enumerate(ordered_urls):
        will_save_as_local_image_name = '%s-%0*d' % (image_name_prefix, index_width, index)
        entry.content = entry.content.replace(remote_image_url, will_save_as_local_image_name)
        try:
            rpc = urlfetch.create_rpc(deadline=10)
            rpc.callback = _create_async_rpc_callback(rpc, entry,
                                remote_image_url, will_save_as_local_image_name)
            urlfetch.make_fetch_call(rpc, remote_image_url)
        except Exception:
            # Best effort: a URL that cannot be requested is restored in the
            # content and counted as handled. sys.exc_info() keeps the except
            # clause valid on every Python version.
            _handle_failed_fetch_remote_image(entry, will_save_as_local_image_name,
                                              remote_image_url, sys.exc_info()[1])
        else:
            rpcs.append(rpc)

    # Waiting on each RPC is what triggers its callback.
    for rpc in rpcs:
        rpc.wait()

class SaveRemoteImagesPlugin(Plugin):
    """Plugin that saves remote images referenced by a post into local storage.

    On construction the plugin registers ``handle`` for the "save_post"
    action. When a post is saved, ``<img>`` URLs in its content are scanned:
    URLs starting with "../media" are rewritten to "/media...", and "http"
    URLs that do not match any configured ignore-domain pattern are passed to
    ``_async_fetch_remote_images``.

    The ignore-domain patterns are stored under the "srip_ignore_domains"
    option. Each entry is used as a regular expression anchored at the start
    of the image URL.
    """

    def __init__(self):
        Plugin.__init__(self, __file__)
        self.blog = None
        self.author="wade"
        self.authoruri="http://i-and-world.appspot.com"
        self.uri="http://i-and-world.appspot.com"
        self.description="""Automatically save remote images to local gae storage, it's useful when u share other's articles."""
        self.name="SaveRemoteImages"
        self.version="0.0.1"
        # Literal "../media" prefix; the dots are escaped so they cannot match
        # arbitrary characters.
        self.local_media_image_regex = re.compile(r"^\.\./media", re.I)
        self.remote_domain_image_regex = re.compile(r"^http", re.I)
        # Captures the src value of every <img> tag.
        self.all_images_regex = re.compile(r"<img[^>]*? src=[\'\"]?\s*([^\s\'\"]+?)[\'\"\s][^>]*>", re.I)
        self.register_action("save_post",self.handle)
        self.ignore_image_domains = OptionSet.getValue("srip_ignore_domains", [])
        self.ignore_images_regex = self._compile_ignore_regex(self.ignore_image_domains)

    def _compile_ignore_regex(self, domains):
        """Compile ``domains`` into one case-insensitive pattern anchored at the URL start.

        Empty entries are dropped. When nothing is left, a pattern that never
        matches is returned: joining an empty list would give ``^()``, which
        matches every URL and would make every image look ignored.
        """
        patterns = [domain for domain in (domains or []) if domain]
        if not patterns:
            return re.compile(r'(?!)')
        return re.compile(r'^(%s)' % '|'.join(patterns), re.I)

    def _reset_ignore_image_domains(self, ignore_domains=None):
        """Rebuild, recompile and persist the ignore-domain list.

        ``ignore_domains`` is an optional list of patterns; entries are
        stripped and blank ones are dropped. When it is empty or omitted, the
        stored option value is used instead. The blog's own ``baseurl`` is
        always kept in the list (inserted first if missing). The result is
        written back to the "srip_ignore_domains" option.
        """
        if ignore_domains:
            self.ignore_image_domains = [
                domain.strip() for domain in ignore_domains
                    if domain.strip()
                ]
        else:
            self.ignore_image_domains = OptionSet.getValue("srip_ignore_domains", [])

        if not self.ignore_image_domains:
            self.ignore_image_domains = [self.blog.baseurl]
        elif unicode(self.blog.baseurl) not in self.ignore_image_domains:
            self.ignore_image_domains.insert(0, self.blog.baseurl)

        self.ignore_images_regex = self._compile_ignore_regex(self.ignore_image_domains)
        logging.info('self.ignore_image_domains = %s' % self.ignore_images_regex.pattern)
        OptionSet.setValue("srip_ignore_domains", self.ignore_image_domains)

    def get(self, page):
        """Return the HTML of the settings form, pre-filled with the current ignore list.

        If no ignore domains are loaded yet, the list is rebuilt first so it
        contains at least the blog's ``baseurl``.
        """
        if not self.ignore_image_domains:
            self._reset_ignore_image_domains()
        return '''<h3>Setup for your 'Save Remote Images Plugin'</h3>
                    <form action="" method="post">
                    <h3>ignore image domains</h3>
                    <span style="color:red">images from these domains will not be saved to gae.</span>
                    <b>one domain per line, the domain url should be with http/https prefix.</b><br/>
                    example:
                    <ul>%s<br/>
                    http://www.google.com<br/>
                    http://www.baidu.com</ul>
                    <textarea cols=50 rows=10 name='ignore_domains'>%s</textarea>
                    <br>
                    <input type="submit" value="submit">
                    </form>
                    Powered By <a href="http://www.i-and-world.com" target="_blank">I and the world!</a>
                    ''' % (self.blog.baseurl, '\n'.join(self.ignore_image_domains))

    def post(self ,page):
        """Save the submitted ignore list (one pattern per line) and re-render the form."""
        # splitlines() accepts "\r\n" as well as bare "\n" or "\r" separators,
        # so a submission that does not use CRLF is still split per line.
        submitted = page.param("ignore_domains").splitlines()
        self._reset_ignore_image_domains(submitted)
        return self.get(page)

    def handle(self, entry, *args, **kwargs):
        """Handler registered for "save_post": localise the images of ``entry``.

        When remote images are found they are fetched through
        ``_async_fetch_remote_images``. Otherwise, if "../media" URLs were
        rewritten, the entry is saved with ``entry.put()``.
        """
        remote_image_urls, has_local_media_urls = self.detect_remote_images(entry)
        if remote_image_urls:
            _async_fetch_remote_images(entry, remote_image_urls)
        elif has_local_media_urls:
            entry.put()

    def detect_remote_images(self, entry):
        """Classify the ``<img>`` URLs found in ``entry.content``.

        Returns a tuple ``(remote_image_urls, has_local_media_urls)``:
        ``remote_image_urls`` lists the distinct URLs that start with "http"
        and match no ignore pattern; ``has_local_media_urls`` is True when at
        least one URL started with "../media".

        Side effect: each "../media..." URL in ``entry.content`` is replaced
        by the same URL without its first two characters.
        """
        all_image_urls = set(self.all_images_regex.findall(entry.content))

        remote_image_urls = []
        has_local_media_urls = False
        for image_url in all_image_urls:
            if self.local_media_image_regex.match(image_url):
                has_local_media_urls = True
                # Drop the leading ".." so the URL becomes "/media...".
                entry.content = entry.content.replace(image_url, image_url[2:])
            elif (self.remote_domain_image_regex.match(image_url)
                  and not self.ignore_images_regex.match(image_url)):
                remote_image_urls.append(image_url)

        return remote_image_urls, has_local_media_urls

# Module-level alias for the plugin class.
# NOTE(review): presumably this is the name micolog's plugin loader looks up
# in this module -- confirm against the loader.
save_remote_images = SaveRemoteImagesPlugin

 

Relate Posts:

9 Responses to “micolog插件-SaveRemoteImages自动保存远程图片 v0.0.1”

  1. Qi On

    是放在micolog根目录下吧?貌似没起作用阿?

    你这验证码老要输两次……

  2. Qi On

    哦,是放到plugins目录下阿?

  3. cugwei On

    just test the comment function

  4. On

    代码写的很漂亮……支持下

  5. dwmeow On

    请问对于这样的地址该如何过滤?
    支持通配符*吗?写成 http://*.static.flickr.com ??

    http://farm6.static.flickr.com/5289/5359666571_4e43b3ac80_b.jpg

    谢谢!

  6. wade On

    不支持通配符,但是支持正则表达式。
    所以对于这个链接可以使用:http://[^.]+\.static\.flickr\.com/

  7. kycc On

    GAE不是说空间只有500M的吗,那要是快存满了怎么解决?

  8. 乌班兔 On

    GAE对文件的数量和容量是有限制的,最好使用外部相册,imgur.com这个支持API,可以无限上传图片,代码小白,不会改代码,就指望你啦,哈哈!

  9. daitem On

    There are some fascinating time limits in this article however I don’t know if I see all of them middle to heart. There may be some validity however I will take hold opinion until I look into it daitem further. Good article , thanks and we would like more! Added to FeedBurner as effectively

Leave a Reply