# Source code for spyder.processor.htmllinkextractor

#
# Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
#
# htmlextractor.py 21-Jan-2011
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
The :class:`DefaultHtmlLinkExtractor` will try to extract new links from the
``curi.content_body``. In order to find them two regular expressions are used.

1. The ``RELEVANT_TAG_EXTRACTOR`` extracts the following tags:
    - ``<script>..</script>``
    - ``<style>..</style>``
    - ``<meta>``
    - or any other open tag with at least one attribute (e.g. not ``<br>``).

2. The ``LINK_EXTRACTOR`` extracts links from tags using `href` or `src`
attributes.

If the link is relative, the appropriate prefix is automatically added here.

The regular expressions have been adopted from Heritrix. See the Heritrix 3
source code:

``modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java``

.. note:: Heritrix has a newer way of extracting links, i.e. with different
    regular expressions. Since these are working for me at the moment, I am
    fine with it.
"""
import re
import htmlentitydefs

import urlparse

from spyder.core.constants import CURI_EXTRACTED_URLS
from spyder.core.constants import CURI_OPTIONAL_TRUE, CURI_EXTRACTION_FINISHED
from spyder.encoding import get_content_type_encoding

# Placeholder token used inside ``RELEVANT_TAG_EXTRACTOR``. Before the pattern
# is compiled the token is replaced with the maximum number of chars an element
# name may have.
MAX_ELEMENT_REPLACE = "MAX_ELEMENT_REPLACE"

# Pattern for extracting relevant tags from HTML
#
# This pattern extracts:
#  1: <script>...</script>
#  2: <style>...</style>
#  3: <meta...>
#  4: any other open tag with at least one attribute
#     (eg matches "<a href='boo'>" but not "</a>" or "<br>")
#
# Groups in this pattern:
#
#  1: script src=foo>boo</script
#  2: just the script open tag
#  3: style type=moo>zoo</style
#  4: just the style open tag
#  5: entire other tag, without '<' '>'
#  6: element
#  7: meta
#  8: !-- comment --
#
# The script/style bodies and the comment body are matched non-greedily with
# ``.*?``; compile the pattern with ``re.S`` so that they may span several
# lines. (A character class such as ``[^(</script)]`` does NOT mean "anything
# up to </script": it merely forbids the single characters listed in it, which
# made almost every real script or style block fail to match.)
RELEVANT_TAG_EXTRACTOR = (
    r"<(?:"
    r"((script[^>]*)>.*?</script)"
    r"|"
    r"((style[^>]*)>.*?</style)"
    r"|"
    r"(((meta)|(?:\w{1,MAX_ELEMENT_REPLACE}))\s+[^>]*)"
    r"|"
    r"(!--.*?--)"
    r")>"
)


# The simpler pattern to extract links from tags
#
# Groups in this expression:
#
#  1: the word at which the match starts (the element name for the first
#     match inside a tag)
#  2: href | src
#  3: the url including its surrounding quotes
LINK_EXTRACTOR = (
    r"(\w+)[^>]*?(?:(href|src))\s*=\s*"
    r"""(?:("[^"]+"|'[^']+'))"""
)


class DefaultHtmlLinkExtractor(object):
    """
    The default extractor for Links from HTML pages.

    The internal regular expressions currently are not modifiable. Only the
    maximum length of an opening tag can be configured using the
    ``settings.REGEX_LINK_XTRACTOR_MAX_ELEMENT_LENGTH``.

    An instance is callable: calling it with a `curi` stores the links found
    in ``curi.content_body`` as a newline separated string in
    ``curi.optional_vars[CURI_EXTRACTED_URLS]`` and returns the `curi`.
    """

    # Content types the extractor is willing to process. The two ``+xml``
    # entries are the registered names of the XHTML types; the shorter
    # spellings are kept for backward compatibility.
    _ALLOWED_CONTENT_TYPES = frozenset([
        "text/html",
        "application/xhtml",
        "application/xhtml+xml",
        "text/vnd.wap.wml",
        "application/vnd.wap.wml",
        "application/vnd.wap.xhtm",
        "application/vnd.wap.xhtml+xml",
    ])

    # Links starting with one of these schemes are never reported.
    _IGNORED_SCHEMES = ("mailto:", "javascript:")

    def __init__(self, settings):
        """
        Initialize the regular expressions.

        `settings` has to provide `REGEX_LINK_XTRACTOR_MAX_ELEMENT_LENGTH`,
        the maximum number of chars an element name may have.
        """
        max_size = settings.REGEX_LINK_XTRACTOR_MAX_ELEMENT_LENGTH
        tag_pattern = RELEVANT_TAG_EXTRACTOR.replace(MAX_ELEMENT_REPLACE,
            str(max_size))
        self._tag_extractor = re.compile(tag_pattern, re.I | re.S)

        self._link_extractor = re.compile(LINK_EXTRACTOR, re.I | re.S)
        # base url used to resolve relative links; reset for every curi and
        # possibly overwritten by a <base> tag
        self._base_url = ""

    def __call__(self, curi):
        """
        Actually extract links from the html content if the content type
        matches.

        The `curi` is returned untouched if its content type is not allowed or
        if ``CURI_EXTRACTION_FINISHED`` is already set to
        ``CURI_OPTIONAL_TRUE``.
        """
        if not self._restrict_content_type(curi):
            return curi

        if CURI_EXTRACTION_FINISHED in curi.optional_vars and \
            curi.optional_vars[CURI_EXTRACTION_FINISHED] == CURI_OPTIONAL_TRUE:
            return curi

        (_type, encoding) = get_content_type_encoding(curi)

        try:
            content = curi.content_body.decode(encoding)
        except Exception:
            # best effort: unknown/missing encoding or undecodable bytes, work
            # on the raw body instead
            content = curi.content_body

        parsed_url = urlparse.urlparse(curi.url)
        self._base_url = curi.url

        # iterate over all tags; a group that did not take part in the match
        # is reported as `None`
        for tag in self._tag_extractor.finditer(content):

            if tag.group(8) is not None:
                # a html comment, ignore
                continue

            elif tag.group(7) is not None:
                # a meta tag
                curi = self._process_meta(curi, parsed_url, content,
                        tag.span(5))

            elif tag.group(5) is not None:
                # generic <whatever tag
                curi = self._process_generic_tag(curi, parsed_url, content,
                        tag.span(6), tag.span(5))

            elif tag.group(1) is not None:
                # <script> tag
                # TODO no script handling so far
                pass

            elif tag.group(3) is not None:
                # <style> tag
                # TODO no tag handling so far
                pass

        return curi

    def _process_generic_tag(self, curi, parsed_url, content,
            element_name_tuple, element_tuple):
        """
        Process a generic tag.

        This can be anything but `meta`, `script` or `style` tags. Links are
        only extracted from `a` elements; a `base` element replaces the base
        url used for resolving the links that follow.

        `content` is the decoded content body.
        `element_name_tuple` is a tuple containing (start,end) integers of
            the current tag name.
        `element_tuple` is a tuple containing (start,end) integers of the
            current element
        """
        (start, end) = element_name_tuple
        el_name = content[start:end].lower()
        if el_name == "a":
            curi = self._extract_links(curi, parsed_url, content,
                    element_tuple)
        elif el_name == "base":
            # a <base> tag may carry only e.g. a `target` attribute, in that
            # case there is no link and the current base url is kept
            base_links = self._get_links(content, element_tuple)
            if base_links:
                self._base_url = base_links[0]

        return curi

    def _get_links(self, content, element_tuple):
        """
        Do the actual link extraction and return the list of links.

        `mailto:` and `javascript:` links are dropped, all other links are
        resolved against the current base url.

        `content` is the decoded content body.
        `element_tuple` is a tuple containing (start,end) integers of the
            current element
        """
        links = []
        (start, end) = element_tuple
        element = content[start:end]

        for link_candidate in self._link_extractor.finditer(element):
            # strip the quotes first and unescape afterwards: an entity such
            # as &quot; inside the value must not act as attribute delimiter
            link = self._unescape_html(link_candidate.group(3)[1:-1])
            if link.strip().lower().startswith(self._IGNORED_SCHEMES):
                continue
            # urljoin leaves absolute urls alone, so there is no need to guess
            # whether the link is relative (a relative link may contain
            # "://" in its query string)
            links.append(urlparse.urljoin(self._base_url, link))

        return links

    def _extract_links(self, curi, parsed_url, content, element_tuple):
        """
        Extract links from an element, e.g. href="" attributes.

        The links are appended, separated by newlines, to
        ``curi.optional_vars[CURI_EXTRACTED_URLS]``. Elements without a usable
        link leave the `curi` unchanged.
        """
        links = self._get_links(content, element_tuple)
        if not links:
            # do not add empty lines to the list of extracted urls
            return curi

        linkstring = "\n".join(links).encode('ascii', 'replace')
        if CURI_EXTRACTED_URLS not in curi.optional_vars:
            curi.optional_vars[CURI_EXTRACTED_URLS] = linkstring
        else:
            curi.optional_vars[CURI_EXTRACTED_URLS] += "\n" + linkstring

        return curi

    def _process_meta(self, curi, _parsed_url, _content, _element_tuple):
        """
        Process a meta tag.

        Currently a no-op: the `curi` is returned unchanged.
        """
        return curi

    def _restrict_content_type(self, curi):
        """
        Decide based on the `CrawlUri`s Content-Type whether we want to process
        it.

        Returns `True` if the content type is one of the allowed types.
        """
        (ctype, _enc) = get_content_type_encoding(curi)
        return ctype in self._ALLOWED_CONTENT_TYPES

    def _unescape_html(self, link):
        """
        Unescape the link.

        Numeric character references (``&#38;``, ``&#x26;``) and named
        entities known to `htmlentitydefs` (including ``&amp;``, ``&gt;`` and
        ``&lt;``) are replaced with the character they stand for. Invalid or
        unknown references are left as they are.

        http://effbot.org/zone/re-sub.htm#unescape-html
        """
        def fixup(m):
            text = m.group(0)
            if text[:2] == "&#":
                # character reference
                try:
                    if text[:3] == "&#x":
                        return unichr(int(text[3:-1], 16))
                    else:
                        return unichr(int(text[2:-1]))
                except (ValueError, OverflowError):
                    # not a number or outside of the unicode range
                    pass
            else:
                # named entity
                try:
                    text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
                except KeyError:
                    pass
            return text
        return re.sub(r"&#?\w+;", fixup, link)
# Navigation
#
# Source code for spyder.processor.htmllinkextractor
#
# Quick search
#
# Navigation