Skip to content

Python 3.4 support (with compatibility to 2.6 and 2.7) #64

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 26, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ dist
/man
nosetests.xml
.coverage
.tox
.idea
.cache
6 changes: 6 additions & 0 deletions readability/compat/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""
This module contains compatibility helpers for Python 2/3 interoperability.

It mainly exists because there are certain incompatibilities in the Python
syntax that can only be solved by conditionally importing different functions.
"""
6 changes: 6 additions & 0 deletions readability/compat/three.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
def raise_with_traceback(exc_type, traceback, *args, **kwargs):
    """
    Instantiate `exc_type` with the given positional and keyword arguments
    and raise it with the supplied `traceback` attached.

    Python 3 counterpart of readability.compat.two.raise_with_traceback.
    """
    exc = exc_type(*args, **kwargs)
    raise exc.with_traceback(traceback)
6 changes: 6 additions & 0 deletions readability/compat/two.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# NOTE: Python 2 ONLY. The three-argument `raise instance, None, traceback`
# form used below is a SyntaxError on Python 3; the Python 3 equivalent lives
# in readability/compat/three.py and is selected at the call site via a
# sys.version_info check.
def raise_with_traceback(exc_type, traceback, *args, **kwargs):
    """
    Raise a new exception of type `exc_type` with an existing `traceback`. All
    additional (keyword-)arguments are forwarded to `exc_type`.
    """
    # Python 2 extended raise: first operand is the exception instance,
    # second (value) is None because the instance is already constructed,
    # third attaches the caller-supplied traceback to the raised exception.
    raise exc_type(*args, **kwargs), None, traceback
28 changes: 17 additions & 11 deletions readability/encoding.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,33 @@
import re
import chardet
import sys

def get_encoding(page):
# Regex for XML and HTML Meta charset declaration
charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
charset_re = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
pragma_re = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
xml_re = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

declared_encodings = (charset_re.findall(page) +
pragma_re.findall(page) +
xml_re.findall(page))

# Try any declared encodings
if len(declared_encodings) > 0:
for declared_encoding in declared_encodings:
try:
page.decode(custom_decode(declared_encoding))
return custom_decode(declared_encoding)
except UnicodeDecodeError:
pass
for declared_encoding in declared_encodings:
try:
if sys.version_info[0] == 3:
# declared_encoding will actually be bytes but .decode() only
# accepts `str` type. Decode blindly with ascii because no one should
# ever use non-ascii characters in the name of an encoding.
declared_encoding = declared_encoding.decode('ascii', 'replace')

page.decode(custom_decode(declared_encoding))
return custom_decode(declared_encoding)
except UnicodeDecodeError:
pass

# Fallback to chardet if declared encodings fail
text = re.sub('</?[^>]*>\s*', ' ', page)
text = re.sub(b'</?[^>]*>\s*', b' ', page)
enc = 'utf-8'
if not text.strip() or len(text) < 10:
return enc # can't guess
Expand Down
9 changes: 6 additions & 3 deletions readability/htmls.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@

utf8_parser = lxml.html.HTMLParser(encoding='utf-8')

if sys.version_info[0] == 2:
str = unicode

def build_doc(page):
if isinstance(page, unicode):
if isinstance(page, str):
enc = None
page_unicode = page
else:
Expand All @@ -33,7 +36,7 @@ def normalize_entities(cur_title):
u'\u00BB': '"',
u'&quot;': '"',
}
for c, r in entities.iteritems():
for c, r in entities.items():
if c in cur_title:
cur_title = cur_title.replace(c, r)

Expand Down Expand Up @@ -105,7 +108,7 @@ def shorten_title(doc):

def get_body(doc):
[ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
raw_html = unicode(tostring(doc.body or doc))
raw_html = str(tostring(doc.body or doc))
cleaned = clean_attributes(raw_html)
try:
#BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
Expand Down
28 changes: 18 additions & 10 deletions readability/readability.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python
from __future__ import print_function
import logging
import re
import sys
Expand All @@ -20,6 +21,8 @@
logging.basicConfig(level=logging.INFO)
log = logging.getLogger()

if sys.version_info[0] == 2:
str = unicode

REGEXES = {
'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
Expand Down Expand Up @@ -81,11 +84,12 @@ def text_length(i):
def compile_pattern(elements):
if not elements:
return None
if isinstance(elements, regexp_type):
elif isinstance(elements, regexp_type):
return elements
if isinstance(elements, basestring):
else:
# assume string or string like object
elements = elements.split(',')
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)

class Document:
"""Class to build a etree document out of html."""
Expand Down Expand Up @@ -195,9 +199,13 @@ def summary(self, html_partial=False):
continue
else:
return cleaned_article
except StandardError, e:
except Exception as e:
log.exception('error getting summary: ')
raise Unparseable(str(e)), None, sys.exc_info()[2]
if sys.version_info[0] == 2:
from .compat.two import raise_with_traceback
else:
from .compat.three import raise_with_traceback
raise_with_traceback(Unparseable, sys.exc_info()[2], str(e))

def get_article(self, candidates, best_candidate, html_partial=False):
# Now that we have the top candidate, look through its siblings for
Expand Down Expand Up @@ -388,7 +396,7 @@ def transform_misused_divs_into_paragraphs(self):
# This results in incorrect results in case there is an <img>
# buried within an <a> for example
if not REGEXES['divToPElementsRe'].search(
unicode(''.join(map(tostring, list(elem))))):
str(''.join(map(str, map(tostring, list(elem)))))):
#self.debug("Altering %s to p" % (describe(elem)))
elem.tag = "p"
#print "Fixed element "+describe(elem)
Expand Down Expand Up @@ -609,18 +617,18 @@ def main():

file = None
if options.url:
import urllib
file = urllib.urlopen(options.url)
import urllib.request, urllib.parse, urllib.error
file = urllib.request.urlopen(options.url)
else:
file = open(args[0], 'rt')
enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
try:
print Document(file.read(),
print(Document(file.read(),
debug=options.verbose,
url=options.url,
positive_keywords = options.positive_keywords,
negative_keywords = options.negative_keywords,
).summary().encode(enc, 'replace')
).summary().encode(enc, 'replace'))
finally:
file.close()

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-e .
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python
from __future__ import print_function
from setuptools import setup, find_packages
import sys

Expand All @@ -8,7 +9,7 @@
mac_ver = platform.mac_ver()[0]
mac_ver_no = int(mac_ver.split('.')[1])
if mac_ver_no < 9:
print "Using lxml<2.4"
print("Using lxml<2.4")
lxml_requirement = "lxml<2.4"

setup(
Expand Down
20 changes: 20 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Tox (http://tox.testrun.org/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.

[tox]
envlist = py26, py27, py33, py34

[testenv]
deps=pytest
# This creates the virtual envs with --site-packages so that packages
# which are already installed will be reused. This is especially useful on
# Windows. Since we depend on lxml, instead of compiling it locally (which
# requires a compiler and the build dependencies), you can download
# it from http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml and install it via
# $PYTHONDIR\Scripts\pip.exe install *.whl
sitepackages=True
commands =
pip install -r requirements.txt
py.test