Source code for spyder.processor.scoper

#
# Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
#
# scoper.py 24-Jan-2011
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
The *Crawl Scope* defines which *URLs* the *Spyder* should process. The main
use cases for it are:

- only spider content from the *Seed* Hosts
- do not spider images, css, videos

and there are probably many other reasons why you want at least one scoper
configured; otherwise you might end up downloading the whole internet.

Each scoper should therefore iterate over the URLs in
``curi.optional_vars[CURI_EXTRACTED_URLS]`` and determine for each one whether
it should be downloaded or not.

The :class:`RegexScoper` maintains lists of regular expressions that define
the crawl scope. Two classes of expressions exist: positive and negative.
The scoper's initial decision for each *URL* is not to download it. If a regex
from the positive list matches, and no regex from the negative list matches,
the *URL* is marked for downloading. In any other case, the *URL* is
abandoned.
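
A minimal configuration sketch (the two setting names are the ones read by
:class:`RegexScoper`; the seed host and file extensions are illustrative
only)::

    # crawl only pages on the seed host, but skip images and stylesheets
    REGEX_SCOPE_POSITIVE = ["^http://example[.]com/"]
    REGEX_SCOPE_NEGATIVE = [".*[.](jpg|png|gif|css)$"]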

.. note:: We should really split up the regex scoper and allow the user to
    configure more than just one scoper.
"""

import re

from spyder.core.constants import CURI_EXTRACTED_URLS


class RegexScoper(object):
    """
    The scoper based on regular expressions.

    There are two settings that influence this scoper:

    1. ``settings.REGEX_SCOPE_POSITIVE``
    2. ``settings.REGEX_SCOPE_NEGATIVE``

    Both have to be a ``list``. The scoper is executed in the
    :meth:`__call__` method.
    """

    def __init__(self, settings):
        """
        Compile the regular expressions.
        """
        self._positive_regex = []
        for regex in settings.REGEX_SCOPE_POSITIVE:
            self._positive_regex.append(re.compile(regex))

        self._negative_regex = []
        for regex in settings.REGEX_SCOPE_NEGATIVE:
            self._negative_regex.append(re.compile(regex))

    def __call__(self, curi):
        """
        Filter all newly extracted URLs for those we want in this crawl.
        """
        if CURI_EXTRACTED_URLS not in curi.optional_vars:
            return curi

        urls = []
        for url in curi.optional_vars[CURI_EXTRACTED_URLS].split("\n"):
            add_url = False

            for regex in self._positive_regex:
                if regex.match(url):
                    add_url = True

            for regex in self._negative_regex:
                if regex.match(url):
                    add_url = False

            if add_url:
                urls.append(url)

        curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join(urls)

        return curi
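

# The block below is an illustrative sketch and not part of the crawler
# itself: it builds minimal stand-ins for the settings object and for a
# CrawlUri carrying extracted URLs, just to show how the scoper filters
# them. Only the setting names and the CURI_EXTRACTED_URLS handling are
# taken from the code above; everything else is made up for the demo.
if __name__ == "__main__":

    class _DemoSettings(object):
        """Stand-in for the real crawler settings."""
        REGEX_SCOPE_POSITIVE = ["^http://example[.]com/"]
        REGEX_SCOPE_NEGATIVE = [".*[.](jpg|png|css)$"]

    class _DemoCuri(object):
        """Stand-in for a CrawlUri with newline-joined extracted URLs."""
        def __init__(self, urls):
            self.optional_vars = {CURI_EXTRACTED_URLS: "\n".join(urls)}

    scoper = RegexScoper(_DemoSettings())
    curi = scoper(_DemoCuri([
        "http://example.com/index.html",
        "http://example.com/logo.png",
        "http://other.org/page.html",
    ]))

    # only http://example.com/index.html survives the scoping
    print(curi.optional_vars[CURI_EXTRACTED_URLS])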