# Source code for spyder.core.sink

#
# Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
#
# sink.py 02-Feb-2011
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
A sink of :class:`CrawlUri`.
"""


class AbstractCrawlUriSink(object):
    """
    Abstract sink for crawled uris.

    Every hook below is a no-op by default, so subclasses only need to
    override the methods they are interested in.
    """

    def process_successful_crawl(self, curi):
        """
        We have crawled a uri successfully. If there are newly extracted
        links, add them alongside the original uri to the frontier.

        The default implementation does nothing.

        :param curi: the crawled uri (presumably a :class:`CrawlUri`).
        """
        pass

    def process_not_found(self, curi):
        """
        The uri we should have crawled was not found, i.e. HTTP Error 404.
        Do something with that.

        The default implementation does nothing.

        :param curi: the uri that was not found (presumably a
            :class:`CrawlUri`).
        """
        pass

    def process_redirect(self, curi):
        """
        There have been too many redirects, i.e. in the default config there
        have been more than 3 redirects.

        The default implementation does nothing.

        :param curi: the uri that was redirected too often (presumably a
            :class:`CrawlUri`).
        """
        pass

    def process_server_error(self, curi):
        """
        There has been a server error, i.e. HTTP Error 50x. Maybe we should
        try to crawl this uri again a little bit later.

        The default implementation does nothing.

        :param curi: the uri that caused the server error (presumably a
            :class:`CrawlUri`).
        """
        pass
class CouchDbSink(object):
    """
    Simple sink that will store :class:`CrawlUri` inside a CouchDB instance.
    """

    def __init__(self, host_port="127.0.0.1:5984", database="spyder"):
        """
        Initialize the HTTP Client.

        NOTE(review): the body is currently a no-op -- neither argument is
        stored or used yet.

        :param host_port: ``host:port`` string, presumably the address of the
            CouchDB server (defaults to ``"127.0.0.1:5984"``).
        :param database: presumably the name of the CouchDB database
            (defaults to ``"spyder"``).
        """
        pass