Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
115 commits
Select commit Hold shift + click to select a range
a7898ac
Fix minor spelling mistake
MichaelAquilina Aug 1, 2014
b6a54f9
Use PEP8 convention for boolean statements
MichaelAquilina Aug 1, 2014
2916126
Do not fail when stopword list is not available for a certain language
MichaelAquilina Aug 1, 2014
b933004
Fix minor spelling error "handling"
MichaelAquilina Aug 3, 2014
c31c9c4
#157 - add test case files
Dec 28, 2014
5ac4a32
#157 - remove childnode one by one to keep parent node
Dec 28, 2014
71f1dec
#157 - handle schema.org microdata
Dec 28, 2014
6215ffa
#157 - add test case
Dec 28, 2014
b8991df
#157 - remove print
Dec 28, 2014
9379cd8
#157 - corrected content with microdata
Dec 28, 2014
f28a6e7
#157 - refactor
Dec 28, 2014
f7ccb2e
Merge branch 'feature/open-graph-157' into develop
Dec 28, 2014
ced075f
#160 - fail silently for unknown images
Dec 29, 2014
048bcdb
#161 - add parser list variable
Dec 29, 2014
eaaa60a
#161 - parser fallback
Dec 29, 2014
4a430c8
Merge branch 'feature/parser-fallback-161' into develop
Dec 29, 2014
f6647fc
Merge pull request #5 from cronycle/feature/4-publish-date
Dec 29, 2014
2498065
#163 - add schema published date parsing test
Dec 29, 2014
5910f39
#163 - do not use only meta for publication date
Dec 29, 2014
31cf754
Merge branch 'feature/publish-date-163' into develop
Dec 29, 2014
f8fc13d
#165 - add opengraph property to article
Dec 29, 2014
a27cfff
#165 - extract opengraph data
Dec 29, 2014
101e69c
#165 - opengraph extractor
Dec 29, 2014
eb1274b
#165 - rename dict
Dec 29, 2014
c2eb34e
#165 - opengraph extraction test
Dec 29, 2014
6bbe2db
#165 - rename article body extraction test
Dec 29, 2014
517eb67
Merge branch 'feature/opengraph-165' into develop
Dec 29, 2014
124371e
#139 - article links property
Dec 29, 2014
4adf4bc
#139 - extract article links
Dec 29, 2014
0a3303e
#139 - article links extract method
Dec 29, 2014
8a4ecf2
Merge branch 'feature/links-139' into develop
Dec 29, 2014
cda2ef6
#142 - extract authors
Dec 29, 2014
675c077
#169 - extract tweets
Dec 29, 2014
af49328
#169 - extract tweets
Dec 29, 2014
27709a4
Merge branch 'feature/extract-twitter-169' into develop
Dec 29, 2014
90b3cac
Replaced bare except with except Exception
pistolero Dec 29, 2014
90b041d
#171 - do not increment version yet
Dec 29, 2014
21a67ca
Merge branch 'pistolero-develop' into develop
Dec 29, 2014
848acf8
#172 - tweet extraction tests
Dec 29, 2014
321fb86
#173 - authors extraction test case
Dec 30, 2014
989ab24
#175 - links extraction tests
Dec 30, 2014
96caa3c
#177 - tags are a list
Dec 30, 2014
6338f68
#177 - title is empty string by default
Dec 30, 2014
206f6e2
#177 - info method return article data as dict
Dec 30, 2014
37e24b2
#177 - add top image to returned dict
Dec 30, 2014
6f00464
Merge branch 'feature/article-infos-177' into develop
Dec 30, 2014
e452c23
#129 - add issue test case
Dec 30, 2014
a36b5a8
#129 - force articleBody to be the document root if found
Dec 30, 2014
502053d
Merge branch 'feature/empty-content-129' into develop
Dec 30, 2014
b04f1e9
#137 - opengraph title test case
Dec 30, 2014
655aca6
#137 - test separator
Dec 30, 2014
3ff269e
#137 - use og:title in test case
Dec 30, 2014
d31112b
#137 - corrected title
Dec 30, 2014
0e370dc
#137 - corrected title
Dec 30, 2014
66b63fc
#137 - fetch opengraph before title
Dec 30, 2014
bd96c94
#137 - refactor title extraction based on opengraph, meta headling an…
Dec 30, 2014
148ce9b
#137 - more explicit error message
Dec 30, 2014
4fd94de
Merge branch 'feature/title-extraction-137' into develop
Dec 30, 2014
e404f1b
#115 - remove businessinsider tests case due to no valid html
Dec 30, 2014
b5ddaf1
#115 - add issue 115 test files
Dec 30, 2014
c7ec678
#115 - use known content tags to be article main body
Dec 30, 2014
b70075a
Merge branch 'feature/extract-115' into develop
Dec 30, 2014
8d18a8e
Merge branch 'develop' of https://github.com/KillaW0lf04/python-goose…
Dec 30, 2014
413037f
Merge branch 'KillaW0lf04-develop' into develop
Dec 30, 2014
0e6201d
#81 - use correct language for stopwords file
Dec 30, 2014
4632df7
#182 - rename soup parser
Dec 30, 2014
fe5f5e9
#183 - pep8
Dec 30, 2014
22ded4b
#183 - use article tag for a top node
Dec 30, 2014
ce6d8a1
Merge branch 'feature/extract-183' into develop
Dec 30, 2014
57b1534
#185 - movies info
Dec 30, 2014
4eda345
bump version
Dec 30, 2014
066a3c0
Merge branch 'release/1.0.23'
Dec 30, 2014
bcfb9f3
Merge branch 'release/1.0.23' into develop
Dec 30, 2014
dd33aab
ignore egg files
Dec 30, 2014
3ebc97c
#187 - empty list
Dec 30, 2014
8eccabf
#188 - mv article extractor to extractors directory
Dec 31, 2014
731f104
#188 - create a base extractor class
Dec 31, 2014
6ef3f68
#188 - contentextractor inherits form baseextractor
Dec 31, 2014
bcf4654
#188 - create specific extractors classes
Dec 31, 2014
cbbfba3
#188 - add tags and author extractors
Dec 31, 2014
a957931
#188 - correct import
Dec 31, 2014
8d6d49e
#188 - move video to extractor directory
Dec 31, 2014
ab81954
#188 - move images extractor to extractors dir and correct videos
Dec 31, 2014
9597fe1
#188 - rename UpgradedImageIExtractor to ImageExtractor
Dec 31, 2014
a5e96e7
#188 - ImageExtractor extends from BaseExtractor
Dec 31, 2014
0492fb8
#188 - move title extractor from content to title extractor class
Dec 31, 2014
12dfda5
#188 - move links extraction to LinksExtractor class
Dec 31, 2014
2608e43
#188 - move tweet extraction to TweetExtractor class
Dec 31, 2014
4de0f4b
#188 - move authors extraction to AuthorsExtractor class
Dec 31, 2014
cd4cc7e
#188 - rename authors class file
Dec 31, 2014
8eb74d8
#188 - move tags extraction to TagsExtractor class
Dec 31, 2014
1cb9ed4
#188 - move opengraph extraction to OpenGraphExtractor class
Dec 31, 2014
8320262
#188 - move publishdate extraction to PublishDateExtractor class
Dec 31, 2014
08fd6b9
#188 - move meta extraction to MetasExtractor class
Dec 31, 2014
4584341
#188 - rename meta extractor file
Dec 31, 2014
530ab52
#188 - move domain extraction to meta extractor
Dec 31, 2014
49f50b0
#188 - move test files
Dec 31, 2014
6009d44
#188 - tests refactor
Dec 31, 2014
26ba835
#188 - move image test case
Dec 31, 2014
ff4449c
#188 - remove useless file
Dec 31, 2014
c381993
#188 - news extractors tests files
Dec 31, 2014
0e6a771
#188 - test refactor video image tags publishdate
Dec 31, 2014
b762ea8
#188 - move tweets tests case
Dec 31, 2014
ea693a9
#188 - test refactor
Dec 31, 2014
41e951c
#188 - move authors tests
Dec 31, 2014
9be09b8
#188 - move title tests
Dec 31, 2014
6959185
#188 - add empty meta test case
Dec 31, 2014
7f2f5fb
Merge branch 'feature/extractor-refactor-188' into develop
Dec 31, 2014
ca1d824
bump version
Dec 31, 2014
9e28861
Merge branch 'release/1.0.24'
Dec 31, 2014
595209e
Merge branch 'release/1.0.24' into develop
Dec 31, 2014
f9f1f1d
191 - keep available parsers list unchanged during multiple extract()…
harudark Jan 2, 2015
1daa55c
Merge branch 'randvis-bugfixing/191' into develop
Jan 2, 2015
c583da2
bump version
Jan 3, 2015
840ced1
Merge branch 'release/1.0.25'
Jan 3, 2015
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ env/
*~
.idea
._*
*.egg
venv/
goose_extractor.egg-info/
10 changes: 8 additions & 2 deletions goose/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,14 @@ def shutdown_network(self):
pass

def crawl(self, crawl_candiate):
crawler = Crawler(self.config)
article = crawler.crawl(crawl_candiate)
parsers = list(self.config.available_parsers)
parsers.remove(self.config.parser_class)
try:
crawler = Crawler(self.config)
article = crawler.crawl(crawl_candiate)
except (UnicodeDecodeError, ValueError):
self.config.parser_class = parsers[0]
return self.crawl(crawl_candiate)
return article

def initialize(self):
Expand Down
61 changes: 59 additions & 2 deletions goose/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class Article(object):

def __init__(self):
# title of the article
self.title = None
self.title = u""

# stores the lovely, pure text from the article,
# stripped of html, formatting, etc...
Expand Down Expand Up @@ -62,12 +62,24 @@ def __init__(self):

# holds a set of tags that may have
been in the article, these are not meta keywords
self.tags = set()
self.tags = []

# holds a dict of all opengraph data found
self.opengraph = {}

# holds twitter embeds
self.tweets = []

# holds a list of any movies
# we found on the page like youtube, vimeo
self.movies = []

# holds links found in the main article
self.links = []

# hold author names
self.authors = []

# stores the final URL that we're going to try
# and fetch content against, this would be expanded if any
self.final_url = u""
Expand All @@ -94,3 +106,48 @@ def __init__(self):

# A property bucket for consumers of goose to store custom data extractions.
self.additional_data = {}

@property
def infos(self):
    """Return the article's extracted data collapsed into one plain dict.

    Mirrors the individual Article attributes (meta data, title, cleaned
    text, opengraph data, tags, tweets, movies, links, authors and the
    publish date) so consumers can serialise the article directly.
    """
    # flatten the top image (if any) into a small serialisable dict
    main_image = None
    if self.top_image is not None:
        main_image = {
            'url': self.top_image.src,
            'width': self.top_image.width,
            'height': self.top_image.height,
            'type': 'image'
        }

    # flatten each movie object the same way, one dict per movie
    movie_dicts = [{
        'embed_type': movie.embed_type,
        'provider': movie.provider,
        'width': movie.width,
        'height': movie.height,
        'embed_code': movie.embed_code,
        'src': movie.src,
    } for movie in self.movies]

    # key order intentionally matches the original literal
    return {
        "meta": {
            "description": self.meta_description,
            "lang": self.meta_lang,
            "keywords": self.meta_keywords,
            "favicon": self.meta_favicon,
            "canonical": self.canonical_link,
        },
        "image": main_image,
        "domain": self.domain,
        "title": self.title,
        "cleaned_text": self.cleaned_text,
        "opengraph": self.opengraph,
        "tags": self.tags,
        "tweets": self.tweets,
        "movies": movie_dicts,
        "links": self.links,
        "authors": self.authors,
        "publish_date": self.publish_date
    }
3 changes: 2 additions & 1 deletion goose/cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,8 @@ def div_to_para(self, doc, dom_type):
bad_divs += 1
elif div is not None:
replaceNodes = self.get_replacement_nodes(doc, div)
div.clear()
for child in self.parser.childNodes(div):
div.remove(child)

for c, n in enumerate(replaceNodes):
div.insert(c, n)
Expand Down
20 changes: 7 additions & 13 deletions goose/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@

HTTP_DEFAULT_TIMEOUT = 30

AVAILABLE_PARSERS = {
'lxml': Parser,
'soup': ParserSoup,
}


class Configuration(object):

Expand Down Expand Up @@ -84,6 +89,7 @@ def __init__(self):
self.additional_data_extractor = None

# Parser type
self.available_parsers = AVAILABLE_PARSERS.keys()
self.parser_class = 'lxml'

# set the local storage path
Expand All @@ -94,19 +100,7 @@ def __init__(self):
self.http_timeout = HTTP_DEFAULT_TIMEOUT

def get_parser(self):
return Parser if self.parser_class == 'lxml' else ParserSoup

def get_publishdate_extractor(self):
return self.extract_publishdate

def set_publishdate_extractor(self, extractor):
"""\
Pass in to extract article publish dates.
@param extractor a concrete instance of PublishDateExtractor
"""
if not extractor:
raise ValueError("extractor must not be null!")
self.extract_publishdate = extractor
return AVAILABLE_PARSERS[self.parser_class]

def get_additionaldata_extractor(self):
return self.additional_data_extractor
Expand Down
117 changes: 100 additions & 17 deletions goose/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,20 @@
from copy import deepcopy
from goose.article import Article
from goose.utils import URLHelper, RawHelper
from goose.extractors import StandardContentExtractor
from goose.extractors.content import StandardContentExtractor
from goose.extractors.videos import VideoExtractor
from goose.extractors.title import TitleExtractor
from goose.extractors.images import ImageExtractor
from goose.extractors.links import LinksExtractor
from goose.extractors.tweets import TweetsExtractor
from goose.extractors.authors import AuthorsExtractor
from goose.extractors.tags import TagsExtractor
from goose.extractors.opengraph import OpenGraphExtractor
from goose.extractors.publishdate import PublishDateExtractor
from goose.extractors.metas import MetasExtractor
from goose.cleaners import StandardDocumentCleaner
from goose.outputformatters import StandardOutputFormatter
from goose.images.extractors import UpgradedImageIExtractor
from goose.videos.extractors import VideoExtractor

from goose.network import HtmlFetcher


Expand Down Expand Up @@ -63,9 +72,33 @@ def __init__(self, config):
# init the output formatter
self.formatter = self.get_formatter()

# metas extractor
self.metas_extractor = self.get_metas_extractor()

# publishdate extractor
self.publishdate_extractor = self.get_publishdate_extractor()

# opengraph extractor
self.opengraph_extractor = self.get_opengraph_extractor()

# tags extractor
self.tags_extractor = self.get_tags_extractor()

# authors extractor
self.authors_extractor = self.get_authors_extractor()

# tweets extractor
self.tweets_extractor = self.get_tweets_extractor()

# links extractor
self.links_extractor = self.get_links_extractor()

# video extractor
self.video_extractor = self.get_video_extractor()

# title extractor
self.title_extractor = self.get_title_extractor()

# image extrator
self.image_extractor = self.get_image_extractor()

Expand Down Expand Up @@ -95,17 +128,37 @@ def crawl(self, crawl_candidate):
self.article.raw_html = raw_html
self.article.doc = doc
self.article.raw_doc = deepcopy(doc)
# TODO
# self.article.publish_date = config.publishDateExtractor.extract(doc)
# self.article.additional_data = config.get_additionaldata_extractor.extract(doc)
self.article.title = self.extractor.get_title()
self.article.meta_lang = self.extractor.get_meta_lang()
self.article.meta_favicon = self.extractor.get_favicon()
self.article.meta_description = self.extractor.get_meta_description()
self.article.meta_keywords = self.extractor.get_meta_keywords()
self.article.canonical_link = self.extractor.get_canonical_link()
self.article.domain = self.extractor.get_domain()
self.article.tags = self.extractor.extract_tags()

# open graph
self.article.opengraph = self.opengraph_extractor.extract()

# publishdate
self.article.publish_date = self.publishdate_extractor.extract()

# meta
metas = self.metas_extractor.extract()
self.article.meta_lang = metas['lang']
self.article.meta_favicon = metas['favicon']
self.article.meta_description = metas['description']
self.article.meta_keywords = metas['keywords']
self.article.canonical_link = metas['canonical']
self.article.domain = metas['domain']

# tags
self.article.tags = self.tags_extractor.extract()

# authors
self.article.authors = self.authors_extractor.extract()

# title
self.article.title = self.title_extractor.extract()

# check for known node as content body
# if we find one force the article.doc to be the found node
# this will prevent the cleaner to remove unwanted text content
article_body = self.extractor.get_known_article_tags()
if article_body is not None:
self.article.doc = article_body

# before we do any calcs on the body itself let's clean up the document
self.article.doc = self.cleaner.clean()
Expand All @@ -117,10 +170,16 @@ def crawl(self, crawl_candidate):
# let's process it
if self.article.top_node is not None:

# video handeling
# article links
self.article.links = self.links_extractor.extract()

# tweets
self.article.tweets = self.tweets_extractor.extract()

# video handling
self.video_extractor.get_videos()

# image handeling
# image handling
if self.config.enable_image_fetching:
self.get_image()

Expand Down Expand Up @@ -160,8 +219,32 @@ def get_html(self, crawl_candidate, parsing_candidate):
})
return html

def get_metas_extractor(self):
    # Factory: extractor for <meta>-tag data (lang, favicon, description,
    # keywords, canonical link, domain) bound to this crawl's config/article.
    return MetasExtractor(self.config, self.article)

def get_publishdate_extractor(self):
    # Factory: extractor for the article's publication date.
    return PublishDateExtractor(self.config, self.article)

def get_opengraph_extractor(self):
    # Factory: extractor for opengraph (og:*) properties.
    return OpenGraphExtractor(self.config, self.article)

def get_tags_extractor(self):
    # Factory: extractor for article tags (not meta keywords).
    return TagsExtractor(self.config, self.article)

def get_authors_extractor(self):
    # Factory: extractor for author names.
    return AuthorsExtractor(self.config, self.article)

def get_tweets_extractor(self):
    # Factory: extractor for embedded tweets in the article body.
    return TweetsExtractor(self.config, self.article)

def get_links_extractor(self):
    # Factory: extractor for links found in the main article body.
    return LinksExtractor(self.config, self.article)

def get_title_extractor(self):
    # Factory: extractor for the article title.
    return TitleExtractor(self.config, self.article)

def get_image_extractor(self):
return UpgradedImageIExtractor(self.config, self.article)
return ImageExtractor(self.config, self.article)

def get_video_extractor(self):
    # Factory: extractor for embedded videos (e.g. youtube, vimeo).
    return VideoExtractor(self.config, self.article)
Expand Down
38 changes: 38 additions & 0 deletions goose/extractors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
"""\
This is a python port of "Goose" originally licensed to Gravity.com
under one or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.

Python port was written by Xavier Grangier for Recrutae

Gravity.com licenses this file
to you under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""


class BaseExtractor(object):
    """Common base class for all goose extractors.

    Holds the shared extraction context: the crawl configuration, the
    parser class resolved from that configuration, the article being
    populated, and the stopwords class taken from the configuration.
    """

    def __init__(self, config, article):
        # the crawl configuration shared by every extractor
        self.config = config

        # resolve the parser class once from the configuration
        self.parser = self.config.get_parser()

        # the article object this extractor populates
        self.article = article

        # stopwords class as configured for this crawl
        self.stopwords_class = config.stopwords_class
Loading