Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
/docs/html
/docs/styles
*.egg-info/
*.pyc
24 changes: 17 additions & 7 deletions mechanize/_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,15 +149,17 @@ def set_response(self, response, base_url, encoding):
self._encoding = encoding
self._base_url = base_url

def links(self):
def links(self, urltags=None):
"""Return an iterator that provides links of the document."""
if urltags is None: urltags = self.urltags
response = self._response
encoding = self._encoding
base_url = self._base_url
response.seek(0)
p = self.link_parser_class(response, encoding=encoding)

try:
for token in p.tags(*(self.urltags.keys()+["base"])):
for token in p.tags(*(urltags.keys()+["base"])):
if token.type == "endtag":
continue
if token.data == "base":
Expand All @@ -171,7 +173,7 @@ def links(self):
# XXX use attr_encoding for ref'd doc if that doc does not
# provide one by other means
#attr_encoding = attrs.get("charset")
url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
url = attrs.get(urltags[tag]) # XXX is "" a valid URL?
if not url:
# Probably an <A NAME="blah"> link or <AREA NOHREF...>.
# For our purposes a link is something with a URL, so
Expand Down Expand Up @@ -383,13 +385,14 @@ def set_soup(self, soup, base_url, encoding):
self._base_url = base_url
self._encoding = encoding

def links(self):
def links(self, urltags = None):
if urltags is None: urltags = self.urltags
bs = self._bs
base_url = self._base_url
encoding = self._encoding
for ch in bs.recursiveChildGenerator():
if (isinstance(ch, _beautifulsoup.Tag) and
ch.name in self.urltags.keys()+["base"]):
ch.name in urltags.keys()+["base"]):
link = ch
attrs = bs.unescape_attrs(link.attrs)
attrs_dict = dict(attrs)
Expand All @@ -398,7 +401,7 @@ def links(self):
if base_href is not None:
base_url = base_href
continue
url_attr = self.urltags[link.name]
url_attr = urltags[link.name]
url = attrs_dict.get(url_attr)
if not url:
continue
Expand Down Expand Up @@ -576,6 +579,13 @@ def links(self):
raise
return self._links_genf()

def custom_links(self, **kwargs):
    """Return iterable over mechanize.Link-like objects.

    All keyword arguments are forwarded unchanged to the ``links`` method
    of the underlying links factory (for example ``urltags``, a mapping of
    tag name to the attribute holding that tag's URL).

    Raises mechanize.ParseError on failure.
    """
    return self._links_factory.links(**kwargs)

class DefaultFactory(Factory):
"""Based on sgmllib."""
def __init__(self, i_want_broken_xhtml_support=False):
Expand Down
12 changes: 11 additions & 1 deletion mechanize/_mechanize.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import copy, re, os, urllib, urllib2

from _html import DefaultFactory
from _html import DefaultFactory, RobustFactory
import _response
import _request
import _rfc3986
Expand Down Expand Up @@ -409,6 +409,16 @@ def links(self, **kwds):
else:
return links

def images(self, **kwds):
    """Return iterable over images (mechanize.Link objects).

    Images are collected by asking the factory for links built from
    ``<img>`` tags, using their ``src`` attribute as the URL.  If any
    keyword arguments are given, the result is passed through
    ``self._filter_links`` together with those arguments.

    Raises BrowserStateError if the browser is not currently viewing HTML.
    """
    if not self.viewing_html():
        raise BrowserStateError("not viewing HTML")
    imgs = self._factory.custom_links(urltags={"img": "src"})
    if kwds:
        return self._filter_links(imgs, **kwds)
    return imgs

def forms(self):
"""Return iterable over forms.

Expand Down
4 changes: 3 additions & 1 deletion mechanize/_pullparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,9 @@ def unknown_starttag(self, tag, attrs):
self._tokenstack.append(Token("starttag", tag, attrs))
def unknown_endtag(self, tag):
    """Append an "endtag" Token for ``tag`` to the pending token stack."""
    self._tokenstack.append(Token("endtag", tag))

def error_handler(self, exc, pos):
    """Recover from a parse error by skipping a single character.

    ``exc`` is the exception that was raised while parsing at offset
    ``pos``; it is deliberately ignored.  Returns the offset at which
    parsing should resume, which is always ``pos + 1``.
    """
    return pos + 1

def _test():
import doctest, _pullparser
Expand Down
194 changes: 105 additions & 89 deletions mechanize/_sgmllib_copy.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,103 @@ def close(self):
def error(self, message):
    """Signal a parse failure by raising SGMLParseError(message)."""
    raise SGMLParseError(message)

# None or function (exception,pos) -> new_pos
error_handler = None

def _goahead(self, i, rawdata):
n = len(rawdata)
if self.nomoretags:
self.handle_data(rawdata[i:n])
i = n
return i
match = interesting.search(rawdata, i)
if match: j = match.start()
else: j = n
if i < j:
self.handle_data(rawdata[i:j])
i = j
if i == n: return i
if rawdata[i] == '<':
if starttagopen.match(rawdata, i):
if self.literal:
self.handle_data(rawdata[i])
i = i+1
return i
k = self.parse_starttag(i)
if k < 0: return None
i = k
return i
if rawdata.startswith("</", i):
k = self.parse_endtag(i)
if k < 0: return None
i = k
self.literal = 0
return i
if self.literal:
if n > (i + 1):
self.handle_data("<")
i = i+1
else:
# incomplete
return None
return i
if rawdata.startswith("<!--", i):
# Strictly speaking, a comment is --.*--
# within a declaration tag <!...>.
# This should be removed,
# and comments handled only in parse_declaration.
k = self.parse_comment(i)
if k < 0: return None
i = k
return i
if rawdata.startswith("<?", i):
k = self.parse_pi(i)
if k < 0: return None
i = i+k
return i
if rawdata.startswith("<!", i):
# This is some sort of declaration; in "HTML as
# deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>").
k = self.parse_declaration(i)
if k < 0: return None
i = k
return i
elif rawdata[i] == '&':
if self.literal:
self.handle_data(rawdata[i])
i = i+1
return i
match = charref.match(rawdata, i)
if match:
name = match.group(1)
self.handle_charref(name)
i = match.end(0)
if rawdata[i-1] != ';': i = i-1
return i
match = entityref.match(rawdata, i)
if match:
name = match.group(1)
self.handle_entityref(name)
i = match.end(0)
if rawdata[i-1] != ';': i = i-1
return i
else:
self.error('neither < nor & ??')
# We get here only if incomplete matches but
# nothing else
match = incomplete.match(rawdata, i)
if not match:
self.handle_data(rawdata[i])
i = i+1
return i
j = match.end(0)
if j == n:
return None # Really incomplete
self.handle_data(rawdata[i:j])
i = j
return i

# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
Expand All @@ -124,96 +221,15 @@ def goahead(self, end):
i = 0
n = len(rawdata)
while i < n:
if self.nomoretags:
self.handle_data(rawdata[i:n])
i = n
break
match = interesting.search(rawdata, i)
if match: j = match.start()
else: j = n
if i < j:
self.handle_data(rawdata[i:j])
i = j
if i == n: break
if rawdata[i] == '<':
if starttagopen.match(rawdata, i):
if self.literal:
self.handle_data(rawdata[i])
i = i+1
continue
k = self.parse_starttag(i)
if k < 0: break
i = k
continue
if rawdata.startswith("</", i):
k = self.parse_endtag(i)
if k < 0: break
i = k
self.literal = 0
continue
if self.literal:
if n > (i + 1):
self.handle_data("<")
i = i+1
else:
# incomplete
break
continue
if rawdata.startswith("<!--", i):
# Strictly speaking, a comment is --.*--
# within a declaration tag <!...>.
# This should be removed,
# and comments handled only in parse_declaration.
k = self.parse_comment(i)
if k < 0: break
i = k
continue
if rawdata.startswith("<?", i):
k = self.parse_pi(i)
if k < 0: break
i = i+k
continue
if rawdata.startswith("<!", i):
# This is some sort of declaration; in "HTML as
# deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>").
k = self.parse_declaration(i)
if k < 0: break
i = k
continue
elif rawdata[i] == '&':
if self.literal:
self.handle_data(rawdata[i])
i = i+1
continue
match = charref.match(rawdata, i)
if match:
name = match.group(1)
self.handle_charref(name)
i = match.end(0)
if rawdata[i-1] != ';': i = i-1
continue
match = entityref.match(rawdata, i)
if match:
name = match.group(1)
self.handle_entityref(name)
i = match.end(0)
if rawdata[i-1] != ';': i = i-1
continue
if self.error_handler is None:
nexti = self._goahead(i, rawdata)
else:
self.error('neither < nor & ??')
# We get here only if incomplete matches but
# nothing else
match = incomplete.match(rawdata, i)
if not match:
self.handle_data(rawdata[i])
i = i+1
continue
j = match.end(0)
if j == n:
break # Really incomplete
self.handle_data(rawdata[i:j])
i = j
try:
nexti = self._goahead(i, rawdata)
except Exception, e:
nexti = self.error_handler(e, i)
if nexti is None: break
i = nexti
# end while
if end and i < n:
self.handle_data(rawdata[i:n])
Expand Down