# Source code for spyder.processor.scoper
#
# Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
#
# scoper.py 24-Jan-2011
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
The *Crawl Scope* defines which *URLs* the *Spyder* should process. The main
usecases for them are:
- only spider content from the *Seed* Hosts
- do not spider images, css, videos
and there are probably a lot of other reasons you want to have at least one the
scoper configured, otherwise you might end up downloading the internet.
So each scoper should iterate over the
``curi.optional_vars[CURI_EXTRACTED_URLS]`` and determine if it should be
downloaded or not.
The :class:`RegexScoper` maintains a list of regular expressions that define
the crawl scope. Two classes of expressions exist: positive and negative.
The initial decision of the scoper is to not download its content. If a regex
from the positive list matches, and no regex from the negative list matches,
the *URL* is marked for downloading. In any other case, the *URL* will be
abandoned.
.. note:: We should really split up the regex scoper and allow the user to
configure more than just one scoper.
"""
import re
from spyder.core.constants import CURI_EXTRACTED_URLS
class RegexScoper(object):
    """
    The scoper based on regular expressions.

    There are two settings that influence this scoper:

    1. ``settings.REGEX_SCOPE_POSITIVE``
    2. ``settings.REGEX_SCOPE_NEGATIVE``

    Both have to be a ``list`` of regular-expression strings. The scoper is
    executed in the :meth:`__call__` method.
    """

    def __init__(self, settings):
        """
        Compile the regular expressions from both settings lists.
        """
        # Patterns that mark a URL as being in scope.
        self._positive_regex = [re.compile(regex)
                                for regex in settings.REGEX_SCOPE_POSITIVE]
        # Patterns that veto a URL even when a positive pattern matched.
        self._negative_regex = [re.compile(regex)
                                for regex in settings.REGEX_SCOPE_NEGATIVE]

    def __call__(self, curi):
        """
        Filter all newly extracted URLs for those we want in this crawl.

        ``curi.optional_vars[CURI_EXTRACTED_URLS]`` is expected to hold the
        extracted URLs as one newline-separated string; it is rewritten in
        place with only the URLs that are in scope. Returns *curi*.
        """
        if CURI_EXTRACTED_URLS not in curi.optional_vars:
            # Nothing was extracted; nothing to filter.
            return curi

        urls = []
        for url in curi.optional_vars[CURI_EXTRACTED_URLS].split("\n"):
            # Keep a URL iff at least one positive pattern matches and no
            # negative pattern matches. any() short-circuits, unlike the
            # previous exhaustive loops, but the decision is identical.
            add_url = any(regex.match(url) for regex in self._positive_regex)
            if add_url and any(regex.match(url)
                               for regex in self._negative_regex):
                add_url = False
            if add_url:
                urls.append(url)

        curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join(urls)
        return curi