02月23, 2012

使用PyV8解析HTML文档

什么是PyV8?

PyV8是一个用Python封装V8引擎的壳。它提供了简单可用的API,让我们能够利用Python来构建出JavaScript的运行时环境。

PyV8能用来干什么?

在nodejs火热流行的时代,或许很少人关注这个基于python简单封装的v8引擎。在某些方面,它比nodejs简洁,而它们拥有同样的本质基础,使得它具有和nodejs相似的潜力。

既然是基于v8的,那么利用它来解析dom和执行javascript是理所当然的。试想一下,如果我们能够建立一个系统把html文档结构解析成w3c dom树,那么我们就可以在上面运行任何javascript,使得这个系统事实上成为一个浏览器(尽管缺少渲染和显示的部分)。

不过,pyv8提供的是纯js运行环境,因此,要解析html文档,还需要为它构建出w3c dom以及浏览器的环境。实现dom和浏览器接口毫无疑问是件力气活,幸运的是,pyv8的官方demo中为我们做了80%以上的工作,提供了w3c.py和browser.py的支持!我们所要做的是在这基础上做一些基本的修改。

好吧,让我们来实践一下:

首先我们构建一个基本的js运行环境,参考commonjs的api实现js基础的模块加载和执行。这一部分还不涉及dom和浏览器,基本上是js原生环境。

import os
import platform
import re
import threading

from PyV8 import JSContext, JSError

from logger import logger

class CommonJS():
    """Small CommonJS-style module loader and script runner built on a PyV8 JSContext.

    On construction every directory listed in ``_js_path`` is walked
    recursively and each ``*.js`` file is indexed under the name
    ``<relative/dir/>stem``.  ``require`` evaluates an indexed file once and
    caches whatever the script left in the global ``exports`` variable;
    ``execute`` runs a JavaScript string or a Python callable inside the
    context.
    """

    # Directories searched for JavaScript modules.  This list is shared by
    # all instances and is only read while an instance is being constructed.
    _js_path = [os.path.dirname(__file__), os.path.join(os.path.dirname(__file__),"core")]
    _js_logger = logger().instance()

    def __init__(self):
        # Re-entrant lock serialising execute() across threads.  Re-entrant
        # so that code running inside execute() may call execute() again on
        # the same thread without blocking itself.
        self._js_threadlock = threading.RLock()
        self._js_ctx = JSContext(self)
        self._js_modules = {}      # module name -> path of the .js file
        self._loaded_modules = {}  # path -> cached `exports` of that file

        js_file = re.compile(r"(.*)\.js$")
        for jsroot in CommonJS._js_path:
            for (root, dirs, files) in os.walk(jsroot):
                relpath = os.path.relpath(root, jsroot)
                if relpath == os.curdir:
                    namespace = ""
                else:
                    # Module names always use "/" as separator, including on
                    # Windows where os.walk yields backslash-separated paths.
                    namespace = relpath.replace("\\", "/") + "/"
                for _file in files:
                    m = js_file.match(_file)
                    if m:
                        self._js_modules[namespace + m.group(1)] = os.path.join(root, _file)

        self.execute("var exports;")

    @classmethod
    def append(cls, path):
        """Add ``path`` to the shared module search path if it is not already listed.

        The module index is built in ``__init__``, so a path added here is
        only picked up by instances created afterwards.
        """
        if path not in CommonJS._js_path:
            CommonJS._js_path.append(path)

    def require(self, module):
        """Load the indexed module ``module`` and return its exports.

        The file is evaluated at most once; later calls return the cached
        value.  This method evaluates in ``self._js_ctx`` without entering
        it, so from Python it is meant to be invoked through ``execute``
        (e.g. ``self.execute(self.require, [name])``).

        Raises:
            Exception: if ``module`` is not in the index.
            JSError: if evaluating the file fails (logged, then re-raised).
        """
        if module not in self._js_modules:
            raise Exception("unknown module `" + module + "`")
        path = self._js_modules[module]

        if path not in self._loaded_modules:
            self._js_logger.info("loading module <%s>...", module)
            with open(path) as source:
                code = source.read()
            try:
                code = code.decode("utf-8")
                # The original code hands UTF-8 bytes (not unicode) to the
                # engine on Windows; the reason is not visible here.
                if platform.system() == "Windows":
                    code = code.encode("utf-8")
                self._js_ctx.eval(code)
            except JSError as ex:
                self._js_logger.error(ex)
                self._js_logger.debug(ex.stackTrace)
                raise
            # The script publishes its API through the global `exports`.
            self._loaded_modules[path] = self._js_ctx.locals.exports

        return self._loaded_modules[path]

    def execute(self, code, args=None):
        """Run ``code`` inside the JavaScript context and return its result.

        Args:
            code: a JavaScript source string to evaluate, or a Python
                callable to invoke while the context is entered.
            args: positional arguments for ``code`` when it is a callable;
                ignored for strings.

        Raises:
            JSError: if the script fails (logged, then re-raised).
        """
        # Only one thread may be inside the context at a time.
        with self._js_threadlock:
            self._js_ctx.enter()
            try:
                if isinstance(code, basestring):
                    # Only byte strings need decoding; calling .decode() on a
                    # unicode object would first encode it as ASCII and fail
                    # on any non-ASCII character.
                    if isinstance(code, str):
                        code = code.decode("utf-8")
                    if platform.system() == "Windows":
                        code = code.encode("utf-8")
                    return self._js_ctx.eval(code)
                return code(*(args or ()))
            except JSError as ex:
                self._js_logger.error(ex)
                self._js_logger.debug(ex.stackTrace)
                raise
            finally:
                self._js_ctx.leave()

好了,我们可以测试一下:

from commonjs import CommonJS
ctx = CommonJS()
ctx.append("/my/js/path")
print ctx.execute("require("base"); QW.provide("Test",{}); exports = QW.Test");

结果是 [JSObject]

在这基础上,我们进一步实现整个运行时环境和一些内置方法:

#JavaScript HTML Context for PyV8

from PyV8 import JSClass
from logger import logger
import re, threading, hashlib

import urllib,urllib2

from w3c import parseString, Document, HTMLElement
from commonjs import CommonJS

import browser

from StringIO import StringIO
import gzip

class JSR(CommonJS, browser.HtmlWindow):
    """JavaScript runtime bound to an HTML document.

    Combines the CommonJS loader with ``browser.HtmlWindow`` and adds
    thread-based timers, a ``console`` object and an ``md5`` helper.
    """

    def __init__(self, url_or_dom, charset=None, headers=None, body=None, timeout=2):
        """Build the runtime from a document, an HTML string or a URL.

        Args:
            url_or_dom: a ``w3c.Document``, a string of markup (first
                non-blank character is ``<``), or a URL.  ``http://`` is
                prepended when the URL has no scheme.
            charset: encoding used to decode a fetched page.  A charset
                announced in the response's Content-Type header overrides
                it; ``utf-8`` is used when neither is available.
            headers: dict of HTTP request headers; its ``User-Agent`` entry
                also selects the navigator object.
            body: dict of form fields.  When non-empty it is url-encoded and
                sent as the request body (urllib2 then issues a POST).
            timeout: passed to ``socket.setdefaulttimeout``; note that this
                changes the process-wide default socket timeout.
        """
        headers = {} if headers is None else headers
        body = {} if body is None else body

        # Timer id -> threading.Timer.  Kept per instance so that runtimes,
        # which each hand out their own timer ids, cannot cancel or
        # overwrite each other's timers.
        self._js_timer_map = {}

        urllib2.socket.setdefaulttimeout(timeout)
        jsonp = False

        if isinstance(url_or_dom, Document):
            url = "localhost:document"
            dom = url_or_dom

        elif url_or_dom.lstrip().startswith("<"):
            url = "localhost:string"
            dom = parseString(url_or_dom)

        else:  # url
            url = url_or_dom
            if not re.match(r"\w+\:\/\/", url):
                url = "http://" + url

            # Passing data=None keeps the request a GET when there is no
            # body; an empty string would already turn it into a POST.
            data = urllib.urlencode(body) if body else None
            request = urllib2.Request(url, data, headers=headers)
            response = urllib2.urlopen(request)
            try:
                contentType = response.headers.get("Content-Type")
                encoding = response.headers.get("Content-Encoding")
                html = response.read()
            finally:
                response.close()

            if contentType:
                # JSON / JavaScript responses are exposed as window.data
                # further below instead of being treated as a page.
                if re.search(r"x-javascript|json", contentType):
                    jsonp = True
                # Take only the charset token: drop optional quotes and any
                # parameters that follow it.
                m = re.search(r"charset\s*=\s*[\"']?([^\s;\"']+)", contentType, re.I)
                if m:
                    charset = m.group(1)

            if not charset:
                charset = "utf-8"  # default charset

            if encoding and encoding == "gzip":
                html = gzip.GzipFile(fileobj=StringIO(html)).read()

            self.__html__ = html  # raw (undecoded) response body
            try:
                html = unicode(html, encoding=charset, errors="ignore")
            except LookupError:
                # The server announced a codec Python does not know.
                html = unicode(html, encoding="utf-8", errors="ignore")
            dom = parseString(html)

        navigator = browser.matchNavigator(headers.get("User-Agent") or "")

        browser.HtmlWindow.__init__(self, url, dom, navigator)
        CommonJS.__init__(self)

        self.console = JSConsole(self._js_logger)

        # Core script modules loaded into every runtime, in this order.
        for module in ("base", "array.h", "function.h", "helper.h", "object.h",
                       "string.h", "date.h", "custevent", "selector", "dom_retouch"):
            self.execute(self.require, [module])

        if jsonp:
            code = "window.data=" + html.encode("utf-8")
            self.execute(code)

        self._js_logger.info("JavaScript runtime ready.")

    def _js_execTimer(self, id, callback, delay, repeat = False):
        """Schedule timer ``id`` to fire on a background thread after ``delay`` ms.

        The generated script invokes ``_js_timers[id][1].code()`` (the timer
        table is presumably maintained by ``browser.HtmlWindow`` - confirm)
        and, for repeating timers, calls ``_js_execTimer`` again from
        JavaScript to re-arm itself.  ``callback`` is not used directly.
        """
        code = "(function f(){ _js_timers[" + str(id) + "][1].code();"
        if repeat:
            code = code + "_js_execTimer(" + str(id) + ", f, " + str(delay) + ", true);"
        code = code + "})();"

        # execute() serialises access to the JS context across threads.
        timer = threading.Timer(delay / 1000.0, lambda: self.execute(code))
        self._js_timer_map[id] = timer
        timer.start()

    def setTimeout(self, callback, delay):
        """Register a one-shot timer and return its id."""
        timerId = super(JSR, self).setTimeout(callback, delay)
        self._js_execTimer(timerId, callback, delay, False)
        return timerId

    def clearTimeout(self, timerId):
        """Cancel the pending timer ``timerId``; unknown or already-cleared ids are ignored."""
        # pop() removes the entry, so clearing the same id twice is a no-op
        # instead of calling cancel() on a leftover None.
        timer = self._js_timer_map.pop(timerId, None)
        if timer is None:
            return
        timer.cancel()
        super(JSR, self).clearTimeout(timerId)

    def setInterval(self, callback, delay):
        """Register a repeating timer and return its id."""
        timerId = super(JSR, self).setInterval(callback, delay)
        self._js_execTimer(timerId, callback, delay, True)
        return timerId

    def clearInterval(self, timerId):
        """Cancel the repeating timer ``timerId``; unknown ids are ignored."""
        # NOTE(review): the original body was a copy of clearTimeout and also
        # ended in the parent's clearTimeout (not clearInterval); that
        # behaviour is kept here - confirm it is intended.
        self.clearTimeout(timerId)

    def md5(self, str):
        """Return the hexadecimal MD5 digest of ``str`` (unicode is hashed as UTF-8)."""
        if isinstance(str, unicode):
            str = str.encode("utf-8")
        return hashlib.md5(str).hexdigest()

class JSConsole(JSClass):
    """Minimal ``console`` object that forwards ``log`` calls to a Python logger."""

    def __init__(self, logger):
        self._js_logger = logger

    def log(self, msg):
        """Write ``msg`` to the logger at INFO level as unicode text."""
        # A unicode message is logged as-is: str() on it would try an ASCII
        # encode and fail on non-ASCII text.  Anything else is stringified
        # and decoded as UTF-8, with undecodable bytes replaced so that
        # logging itself cannot raise.
        if not isinstance(msg, unicode):
            msg = str(msg).decode("utf-8", "replace")
        self._js_logger.info(msg)

到此为止,一个简单的运行时环境就实现了。

使用方法:

from jsruntime import JSR

rt = JSR("www.baidu.com")
rt.execute("alert(document.title)")  # shows the page title: "百度一下,你就知道"
# Single quotes outside so the selector can be a double-quoted JavaScript string.
rt.execute('document.querySelector("body div")')  # first div under body that matches
...

利用它,你就可以用js来处理抓取的页面元素,甚至通过运行页面上的js来获取动态加载的内容。只要继续扩展,就可以用它来实现一个相当不错的页面内容采集服务;采集规则也不再只能是简单的正则,而是可以用js像处理任何动态网页那样来处理抓取的页面。

有兴趣滴童鞋可以研究下,pyv8的潜力还是很大的,大家一起期待一下~

本文链接:https://www.h5jun.com/post/PyV8.html

-- EOF --

Comments