Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
115 commits
Select commit Hold shift + click to select a range
a7898ac
Fix minor spelling mistake
MichaelAquilina Aug 1, 2014
b6a54f9
Use PEP8 convention for boolean statements
MichaelAquilina Aug 1, 2014
2916126
Do not fail when stopword list is not available for a certain language
MichaelAquilina Aug 1, 2014
b933004
Fix minor spelling error "handling"
MichaelAquilina Aug 3, 2014
c31c9c4
#157 - add test case files
Dec 28, 2014
5ac4a32
#157 - remove childnode one by one to keep parent node
Dec 28, 2014
71f1dec
#157 - handle schema.org microdata
Dec 28, 2014
6215ffa
#157 - add test case
Dec 28, 2014
b8991df
#157 - remove print
Dec 28, 2014
9379cd8
#157 - corrected content with microdata
Dec 28, 2014
f28a6e7
#157 - refactor
Dec 28, 2014
f7ccb2e
Merge branch 'feature/open-graph-157' into develop
Dec 28, 2014
ced075f
#160 - fail silently for unknown images
Dec 29, 2014
048bcdb
#161 - add parser list variable
Dec 29, 2014
eaaa60a
#161 - parser fallback
Dec 29, 2014
4a430c8
Merge branch 'feature/parser-fallback-161' into develop
Dec 29, 2014
f6647fc
Merge pull request #5 from cronycle/feature/4-publish-date
Dec 29, 2014
2498065
#163 - add schema published date parsing test
Dec 29, 2014
5910f39
#163 - do not use only meta for publication date
Dec 29, 2014
31cf754
Merge branch 'feature/publish-date-163' into develop
Dec 29, 2014
f8fc13d
#165 - add opengraph property to article
Dec 29, 2014
a27cfff
#165 - extract opengraph data
Dec 29, 2014
101e69c
#165 - opengraph extractor
Dec 29, 2014
eb1274b
#165 - rename dict
Dec 29, 2014
c2eb34e
#165 - opengraph extraction test
Dec 29, 2014
6bbe2db
#165 - rename article body extraction test
Dec 29, 2014
517eb67
Merge branch 'feature/opengraph-165' into develop
Dec 29, 2014
124371e
#139 - article links property
Dec 29, 2014
4adf4bc
#139 - extract article links
Dec 29, 2014
0a3303e
#139 - article links extract method
Dec 29, 2014
8a4ecf2
Merge branch 'feature/links-139' into develop
Dec 29, 2014
cda2ef6
#142 - extract authors
Dec 29, 2014
675c077
#169 - extract tweets
Dec 29, 2014
af49328
#169 - extract tweets
Dec 29, 2014
27709a4
Merge branch 'feature/extract-twitter-169' into develop
Dec 29, 2014
90b3cac
Replaced bare except with except Exception
pistolero Dec 29, 2014
90b041d
#171 - do not increment version yet
Dec 29, 2014
21a67ca
Merge branch 'pistolero-develop' into develop
Dec 29, 2014
848acf8
#172 - tweet extraction tests
Dec 29, 2014
321fb86
#173 - authors extraction test case
Dec 30, 2014
989ab24
#175 - links extraction tests
Dec 30, 2014
96caa3c
#177 - tags are a list
Dec 30, 2014
6338f68
#177 - title is empty string by default
Dec 30, 2014
206f6e2
#177 - info method return article data as dict
Dec 30, 2014
37e24b2
#177 - add top image to returned dict
Dec 30, 2014
6f00464
Merge branch 'feature/article-infos-177' into develop
Dec 30, 2014
e452c23
#129 - add issue test case
Dec 30, 2014
a36b5a8
#129 - force articleBody to be the document root if found
Dec 30, 2014
502053d
Merge branch 'feature/empty-content-129' into develop
Dec 30, 2014
b04f1e9
#137 - opengraph title test case
Dec 30, 2014
655aca6
#137 - test separator
Dec 30, 2014
3ff269e
#137 - use og:title in test case
Dec 30, 2014
d31112b
#137 - corrected title
Dec 30, 2014
0e370dc
#137 - corrected title
Dec 30, 2014
66b63fc
#137 - fetch opengraph before title
Dec 30, 2014
bd96c94
#137 - refactor title extraction based on opengraph, meta headling an…
Dec 30, 2014
148ce9b
#137 - more explicit error message
Dec 30, 2014
4fd94de
Merge branch 'feature/title-extraction-137' into develop
Dec 30, 2014
e404f1b
#115 - remove businessinsider tests case due to no valid html
Dec 30, 2014
b5ddaf1
#115 - add issue 115 test files
Dec 30, 2014
c7ec678
#115 - use known content tags to be article main body
Dec 30, 2014
b70075a
Merge branch 'feature/extract-115' into develop
Dec 30, 2014
8d18a8e
Merge branch 'develop' of https://github.com/KillaW0lf04/python-goose…
Dec 30, 2014
413037f
Merge branch 'KillaW0lf04-develop' into develop
Dec 30, 2014
0e6201d
#81 - use correct language for stopwords file
Dec 30, 2014
4632df7
#182 - rename soup parser
Dec 30, 2014
fe5f5e9
#183 - pep8
Dec 30, 2014
22ded4b
#183 - use article tag for a top node
Dec 30, 2014
ce6d8a1
Merge branch 'feature/extract-183' into develop
Dec 30, 2014
57b1534
#185 - movies info
Dec 30, 2014
4eda345
bump version
Dec 30, 2014
066a3c0
Merge branch 'release/1.0.23'
Dec 30, 2014
bcfb9f3
Merge branch 'release/1.0.23' into develop
Dec 30, 2014
dd33aab
ignore egg files
Dec 30, 2014
3ebc97c
#187 - empty list
Dec 30, 2014
8eccabf
#188 - mv article extractor to extractors directory
Dec 31, 2014
731f104
#188 - create a base extractor class
Dec 31, 2014
6ef3f68
#188 - contentextractor inherits form baseextractor
Dec 31, 2014
bcf4654
#188 - create specific extractors classes
Dec 31, 2014
cbbfba3
#188 - add tags and author extractors
Dec 31, 2014
a957931
#188 - correct import
Dec 31, 2014
8d6d49e
#188 - move video to extractor directory
Dec 31, 2014
ab81954
#188 - move images extractor to extractors dir and correct videos
Dec 31, 2014
9597fe1
#188 - rename UpgradedImageIExtractor to ImageExtractor
Dec 31, 2014
a5e96e7
#188 - ImageExtractor extends from BaseExtractor
Dec 31, 2014
0492fb8
#188 - move title extractor from content to title extractor class
Dec 31, 2014
12dfda5
#188 - move links extraction to LinksExtractor class
Dec 31, 2014
2608e43
#188 - move tweet extraction to TweetExtractor class
Dec 31, 2014
4de0f4b
#188 - move authors extraction to AuthorsExtractor class
Dec 31, 2014
cd4cc7e
#188 - rename authors class file
Dec 31, 2014
8eb74d8
#188 - move tags extraction to TagsExtractor class
Dec 31, 2014
1cb9ed4
#188 - move opengraph extraction to OpenGraphExtractor class
Dec 31, 2014
8320262
#188 - move publishdate extraction to PublishDateExtractor class
Dec 31, 2014
08fd6b9
#188 - move meta extraction to MetasExtractor class
Dec 31, 2014
4584341
#188 - rename meta extractor file
Dec 31, 2014
530ab52
#188 - move domain extraction to meta extractor
Dec 31, 2014
49f50b0
#188 - move test files
Dec 31, 2014
6009d44
#188 - tests refactor
Dec 31, 2014
26ba835
#188 - move image test case
Dec 31, 2014
ff4449c
#188 - remove useless file
Dec 31, 2014
c381993
#188 - news extractors tests files
Dec 31, 2014
0e6a771
#188 - test refactor video image tags publishdate
Dec 31, 2014
b762ea8
#188 - move tweets tests case
Dec 31, 2014
ea693a9
#188 - test refactor
Dec 31, 2014
41e951c
#188 - move authors tests
Dec 31, 2014
9be09b8
#188 - move title tests
Dec 31, 2014
6959185
#188 - add empty meta test case
Dec 31, 2014
7f2f5fb
Merge branch 'feature/extractor-refactor-188' into develop
Dec 31, 2014
ca1d824
bump version
Dec 31, 2014
9e28861
Merge branch 'release/1.0.24'
Dec 31, 2014
595209e
Merge branch 'release/1.0.24' into develop
Dec 31, 2014
f9f1f1d
191 - keep available parsers list unchanged during multiple extract()…
harudark Jan 2, 2015
1daa55c
Merge branch 'randvis-bugfixing/191' into develop
Jan 2, 2015
c583da2
bump version
Jan 3, 2015
840ced1
Merge branch 'release/1.0.25'
Jan 3, 2015
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ env/
*~
.idea
._*
*.egg
venv/
goose_extractor.egg-info/
10 changes: 8 additions & 2 deletions goose/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,14 @@ def shutdown_network(self):
pass

def crawl(self, crawl_candiate):
crawler = Crawler(self.config)
article = crawler.crawl(crawl_candiate)
parsers = list(self.config.available_parsers)
parsers.remove(self.config.parser_class)
try:
crawler = Crawler(self.config)
article = crawler.crawl(crawl_candiate)
except (UnicodeDecodeError, ValueError):
self.config.parser_class = parsers[0]
return self.crawl(crawl_candiate)
return article

def initialize(self):
Expand Down
61 changes: 59 additions & 2 deletions goose/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class Article(object):

def __init__(self):
# title of the article
self.title = None
self.title = u""

# stores the lovely, pure text from the article,
# stripped of html, formatting, etc...
Expand Down Expand Up @@ -62,12 +62,24 @@ def __init__(self):

# holds a set of tags that may have
been in the article, these are not meta keywords
self.tags = set()
self.tags = []

# holds a dict of all opengraph data found
self.opengraph = {}

# holds twitter embeds
self.tweets = []

# holds a list of any movies
# we found on the page like youtube, vimeo
self.movies = []

# holds links found in the main article
self.links = []

# hold author names
self.authors = []

# stores the final URL that we're going to try
# and fetch content against, this would be expanded if any
self.final_url = u""
Expand All @@ -94,3 +106,48 @@ def __init__(self):

# A property bucket for consumers of goose to store custom data extractions.
self.additional_data = {}

@property
def infos(self):
    """Return the article's extracted data collapsed into one plain dict.

    Mirrors the individual Article attributes (meta data, title, cleaned
    text, opengraph data, tags, tweets, movies, links, authors and the
    publish date) so consumers can serialise the article directly.
    """
    # flatten the top image (if any) into a small serialisable dict
    main_image = None
    if self.top_image is not None:
        main_image = {
            'url': self.top_image.src,
            'width': self.top_image.width,
            'height': self.top_image.height,
            'type': 'image'
        }

    # flatten each movie object the same way, one dict per movie
    movie_dicts = [{
        'embed_type': movie.embed_type,
        'provider': movie.provider,
        'width': movie.width,
        'height': movie.height,
        'embed_code': movie.embed_code,
        'src': movie.src,
    } for movie in self.movies]

    # key order intentionally matches the original literal
    return {
        "meta": {
            "description": self.meta_description,
            "lang": self.meta_lang,
            "keywords": self.meta_keywords,
            "favicon": self.meta_favicon,
            "canonical": self.canonical_link,
        },
        "image": main_image,
        "domain": self.domain,
        "title": self.title,
        "cleaned_text": self.cleaned_text,
        "opengraph": self.opengraph,
        "tags": self.tags,
        "tweets": self.tweets,
        "movies": movie_dicts,
        "links": self.links,
        "authors": self.authors,
        "publish_date": self.publish_date
    }
3 changes: 2 additions & 1 deletion goose/cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,8 @@ def div_to_para(self, doc, dom_type):
bad_divs += 1
elif div is not None:
replaceNodes = self.get_replacement_nodes(doc, div)
div.clear()
for child in self.parser.childNodes(div):
div.remove(child)

for c, n in enumerate(replaceNodes):
div.insert(c, n)
Expand Down
20 changes: 7 additions & 13 deletions goose/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@

HTTP_DEFAULT_TIMEOUT = 30

AVAILABLE_PARSERS = {
'lxml': Parser,
'soup': ParserSoup,
}


class Configuration(object):

Expand Down Expand Up @@ -84,6 +89,7 @@ def __init__(self):
self.additional_data_extractor = None

# Parser type
self.available_parsers = AVAILABLE_PARSERS.keys()
self.parser_class = 'lxml'

# set the local storage path
Expand All @@ -94,19 +100,7 @@ def __init__(self):
self.http_timeout = HTTP_DEFAULT_TIMEOUT

def get_parser(self):
return Parser if self.parser_class == 'lxml' else ParserSoup

def get_publishdate_extractor(self):
return self.extract_publishdate

def set_publishdate_extractor(self, extractor):
"""\
Pass in to extract article publish dates.
@param extractor a concrete instance of PublishDateExtractor
"""
if not extractor:
raise ValueError("extractor must not be null!")
self.extract_publishdate = extractor
return AVAILABLE_PARSERS[self.parser_class]

def get_additionaldata_extractor(self):
return self.additional_data_extractor
Expand Down
117 changes: 100 additions & 17 deletions goose/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,20 @@
from copy import deepcopy
from goose.article import Article
from goose.utils import URLHelper, RawHelper
from goose.extractors import StandardContentExtractor
from goose.extractors.content import StandardContentExtractor
from goose.extractors.videos import VideoExtractor
from goose.extractors.title import TitleExtractor
from goose.extractors.images import ImageExtractor
from goose.extractors.links import LinksExtractor
from goose.extractors.tweets import TweetsExtractor
from goose.extractors.authors import AuthorsExtractor
from goose.extractors.tags import TagsExtractor
from goose.extractors.opengraph import OpenGraphExtractor
from goose.extractors.publishdate import PublishDateExtractor
from goose.extractors.metas import MetasExtractor
from goose.cleaners import StandardDocumentCleaner
from goose.outputformatters import StandardOutputFormatter
from goose.images.extractors import UpgradedImageIExtractor
from goose.videos.extractors import VideoExtractor

from goose.network import HtmlFetcher


Expand Down Expand Up @@ -63,9 +72,33 @@ def __init__(self, config):
# init the output formatter
self.formatter = self.get_formatter()

# metas extractor
self.metas_extractor = self.get_metas_extractor()

# publishdate extractor
self.publishdate_extractor = self.get_publishdate_extractor()

# opengraph extractor
self.opengraph_extractor = self.get_opengraph_extractor()

# tags extractor
self.tags_extractor = self.get_tags_extractor()

# authors extractor
self.authors_extractor = self.get_authors_extractor()

# tweets extractor
self.tweets_extractor = self.get_tweets_extractor()

# links extractor
self.links_extractor = self.get_links_extractor()

# video extractor
self.video_extractor = self.get_video_extractor()

# title extractor
self.title_extractor = self.get_title_extractor()

# image extrator
self.image_extractor = self.get_image_extractor()

Expand Down Expand Up @@ -95,17 +128,37 @@ def crawl(self, crawl_candidate):
self.article.raw_html = raw_html
self.article.doc = doc
self.article.raw_doc = deepcopy(doc)
# TODO
# self.article.publish_date = config.publishDateExtractor.extract(doc)
# self.article.additional_data = config.get_additionaldata_extractor.extract(doc)
self.article.title = self.extractor.get_title()
self.article.meta_lang = self.extractor.get_meta_lang()
self.article.meta_favicon = self.extractor.get_favicon()
self.article.meta_description = self.extractor.get_meta_description()
self.article.meta_keywords = self.extractor.get_meta_keywords()
self.article.canonical_link = self.extractor.get_canonical_link()
self.article.domain = self.extractor.get_domain()
self.article.tags = self.extractor.extract_tags()

# open graph
self.article.opengraph = self.opengraph_extractor.extract()

# publishdate
self.article.publish_date = self.publishdate_extractor.extract()

# meta
metas = self.metas_extractor.extract()
self.article.meta_lang = metas['lang']
self.article.meta_favicon = metas['favicon']
self.article.meta_description = metas['description']
self.article.meta_keywords = metas['keywords']
self.article.canonical_link = metas['canonical']
self.article.domain = metas['domain']

# tags
self.article.tags = self.tags_extractor.extract()

# authors
self.article.authors = self.authors_extractor.extract()

# title
self.article.title = self.title_extractor.extract()

# check for known node as content body
# if we find one force the article.doc to be the found node
# this will prevent the cleaner to remove unwanted text content
article_body = self.extractor.get_known_article_tags()
if article_body is not None:
self.article.doc = article_body

# before we do any calcs on the body itself let's clean up the document
self.article.doc = self.cleaner.clean()
Expand All @@ -117,10 +170,16 @@ def crawl(self, crawl_candidate):
# let's process it
if self.article.top_node is not None:

# video handeling
# article links
self.article.links = self.links_extractor.extract()

# tweets
self.article.tweets = self.tweets_extractor.extract()

# video handling
self.video_extractor.get_videos()

# image handeling
# image handling
if self.config.enable_image_fetching:
self.get_image()

Expand Down Expand Up @@ -160,8 +219,32 @@ def get_html(self, crawl_candidate, parsing_candidate):
})
return html

def get_metas_extractor(self):
    # Factory: extractor for <meta>-tag data (lang, favicon, description,
    # keywords, canonical link, domain) bound to this crawl's config/article.
    return MetasExtractor(self.config, self.article)

def get_publishdate_extractor(self):
    # Factory: extractor for the article's publication date.
    return PublishDateExtractor(self.config, self.article)

def get_opengraph_extractor(self):
    # Factory: extractor for opengraph (og:*) properties.
    return OpenGraphExtractor(self.config, self.article)

def get_tags_extractor(self):
    # Factory: extractor for article tags (not meta keywords).
    return TagsExtractor(self.config, self.article)

def get_authors_extractor(self):
    # Factory: extractor for author names.
    return AuthorsExtractor(self.config, self.article)

def get_tweets_extractor(self):
    # Factory: extractor for embedded tweets in the article body.
    return TweetsExtractor(self.config, self.article)

def get_links_extractor(self):
    # Factory: extractor for links found in the main article body.
    return LinksExtractor(self.config, self.article)

def get_title_extractor(self):
    # Factory: extractor for the article title.
    return TitleExtractor(self.config, self.article)

def get_image_extractor(self):
return UpgradedImageIExtractor(self.config, self.article)
return ImageExtractor(self.config, self.article)

def get_video_extractor(self):
    # Factory: extractor for embedded videos (e.g. youtube, vimeo).
    return VideoExtractor(self.config, self.article)
Expand Down
38 changes: 38 additions & 0 deletions goose/extractors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
"""\
This is a python port of "Goose" originally licensed to Gravity.com
under one or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.

Python port was written by Xavier Grangier for Recrutae

Gravity.com licenses this file
to you under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""


class BaseExtractor(object):
    """Common base class for all goose extractors.

    Holds the shared extraction context: the crawl configuration, the
    parser class resolved from that configuration, the article being
    populated, and the stopwords class taken from the configuration.
    """

    def __init__(self, config, article):
        # the crawl configuration shared by every extractor
        self.config = config

        # resolve the parser class once from the configuration
        self.parser = self.config.get_parser()

        # the article object this extractor populates
        self.article = article

        # stopwords class as configured for this crawl
        self.stopwords_class = config.stopwords_class
Loading