jjlee · albertz · May 16, 2011 · May 16, 2011 · May 16, 2011 · May 16, 2011
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@
 /docs/html
 /docs/styles
 *.egg-info/
+*.pyc
diff --git a/mechanize/_html.py b/mechanize/_html.py
@@ -149,15 +149,17 @@ def set_response(self, response, base_url, encoding):
         self._encoding = encoding
         self._base_url = base_url
 
-    def links(self):
+    def links(self, urltags=None):
         """Return an iterator that provides links of the document."""
+        if urltags is None: urltags = self.urltags
         response = self._response
         encoding = self._encoding
         base_url = self._base_url
+        response.seek(0)
         p = self.link_parser_class(response, encoding=encoding)
-
+        
         try:
-            for token in p.tags(*(self.urltags.keys()+["base"])):
+            for token in p.tags(*(urltags.keys()+["base"])):
                 if token.type == "endtag":
                     continue
                 if token.data == "base":
@@ -171,7 +173,7 @@ def links(self):
                 # XXX use attr_encoding for ref'd doc if that doc does not
                 #  provide one by other means
                 #attr_encoding = attrs.get("charset")
-                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
+                url = attrs.get(urltags[tag])  # XXX is "" a valid URL?
                 if not url:
                     # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                     # For our purposes a link is something with a URL, so
@@ -383,13 +385,14 @@ def set_soup(self, soup, base_url, encoding):
         self._base_url = base_url
         self._encoding = encoding
 
-    def links(self):
+    def links(self, urltags = None):
+        if urltags is None: urltags = self.urltags
         bs = self._bs
         base_url = self._base_url
         encoding = self._encoding
         for ch in bs.recursiveChildGenerator():
             if (isinstance(ch, _beautifulsoup.Tag) and
-                ch.name in self.urltags.keys()+["base"]):
+                ch.name in urltags.keys()+["base"]):
                 link = ch
                 attrs = bs.unescape_attrs(link.attrs)
                 attrs_dict = dict(attrs)
@@ -398,7 +401,7 @@ def links(self):
                     if base_href is not None:
                         base_url = base_href
                     continue
-                url_attr = self.urltags[link.name]
+                url_attr = urltags[link.name]
                 url = attrs_dict.get(url_attr)
                 if not url:
                     continue
@@ -576,6 +579,13 @@ def links(self):
                 raise
         return self._links_genf()
 
+    def custom_links(self, **kwargs):
+        """Return iterable over mechanize.Link-like objects.
+
+        kwargs go to self._links_factory.links().  Raises mechanize.ParseError.
+        """
+        return self._links_factory.links(**kwargs)
+
 class DefaultFactory(Factory):
     """Based on sgmllib."""
     def __init__(self, i_want_broken_xhtml_support=False):

diff --git a/mechanize/_mechanize.py b/mechanize/_mechanize.py
@@ -11,7 +11,7 @@
 
 import copy, re, os, urllib, urllib2
 
-from _html import DefaultFactory
+from _html import DefaultFactory, RobustFactory
 import _response
 import _request
 import _rfc3986
@@ -409,6 +409,16 @@ def links(self, **kwds):
         else:
             return links
 
+    def images(self, **kwds):
+        """Return iterable over <img src> Link objects, filtered by kwds."""
+        if not self.viewing_html():
+            raise BrowserStateError("not viewing HTML")
+        imgs = self._factory.custom_links(urltags = {"img": "src"})
+        if kwds:
+            return self._filter_links(imgs, **kwds)
+        else:
+            return imgs
+
     def forms(self):
         """Return iterable over forms.
 

diff --git a/mechanize/_pullparser.py b/mechanize/_pullparser.py
@@ -381,7 +381,9 @@ def unknown_starttag(self, tag, attrs):
         self._tokenstack.append(Token("starttag", tag, attrs))
     def unknown_endtag(self, tag):
         self._tokenstack.append(Token("endtag", tag))
-
+    def error_handler(self, exc, pos):
+        # Skip the offending character: ignore exc and resume at pos + 1.
+        return pos + 1
 
 def _test():
    import doctest, _pullparser

diff --git a/mechanize/_sgmllib_copy.py b/mechanize/_sgmllib_copy.py
@@ -116,6 +116,103 @@ def close(self):
     def error(self, message):
         raise SGMLParseError(message)
 
+    # Optional recovery hook: None, or callable (exception, pos) -> new_pos
+    error_handler = None
+
+    def _goahead(self, i, rawdata):
+        # Internal -- one parsing step from index i (text, then one construct).
+        # Returns the index to resume at, or None where goahead must stop.
+        n = len(rawdata)
+        if self.nomoretags:
+            self.handle_data(rawdata[i:n])
+            i = n
+            return i
+        match = interesting.search(rawdata, i)
+        if match: j = match.start()
+        else: j = n
+        if i < j:
+            self.handle_data(rawdata[i:j])
+        i = j
+        if i == n: return i
+        if rawdata[i] == '<':
+            if starttagopen.match(rawdata, i):
+                if self.literal:
+                    self.handle_data(rawdata[i])
+                    i = i+1
+                    return i
+                k = self.parse_starttag(i)
+                if k < 0: return None
+                i = k
+                return i
+            if rawdata.startswith("</", i):
+                k = self.parse_endtag(i)
+                if k < 0: return None
+                i = k
+                self.literal = 0
+                return i
+            if self.literal:
+                if n > (i + 1):
+                    self.handle_data("<")
+                    i = i+1
+                else:
+                    # incomplete
+                    return None
+                return i
+            if rawdata.startswith("<!--", i):
+                # Strictly speaking, a comment is --.*-- within a declaration
+                # tag <!...>.  This should be removed, and comments handled
+                # only in parse_declaration.
+                k = self.parse_comment(i)
+                if k < 0: return None
+                i = k
+                return i
+            if rawdata.startswith("<?", i):
+                k = self.parse_pi(i)
+                if k < 0: return None
+                i = i+k
+                return i
+            if rawdata.startswith("<!", i):
+                # This is some sort of declaration; in "HTML as
+                # deployed," this should only be the document type
+                # declaration ("<!DOCTYPE html...>").
+                k = self.parse_declaration(i)
+                if k < 0: return None
+                i = k
+                return i
+        elif rawdata[i] == '&':
+            if self.literal:
+                self.handle_data(rawdata[i])
+                i = i+1
+                return i
+            match = charref.match(rawdata, i)
+            if match:
+                name = match.group(1)
+                self.handle_charref(name)
+                i = match.end(0)
+                if rawdata[i-1] != ';': i = i-1
+                return i
+            match = entityref.match(rawdata, i)
+            if match:
+                name = match.group(1)
+                self.handle_entityref(name)
+                i = match.end(0)
+                if rawdata[i-1] != ';': i = i-1
+                return i
+        else:
+            self.error('neither < nor & ??')
+        # We get here only if nothing above returned (incomplete may match)
+        match = incomplete.match(rawdata, i)
+        if not match:
+            self.handle_data(rawdata[i])
+            i = i+1
+            return i
+        j = match.end(0)
+        if j == n:
+            return None # Really incomplete
+        self.handle_data(rawdata[i:j])
+        i = j
+        return i
+
     # Internal -- handle data as far as reasonable.  May leave state
     # and data to be processed by a subsequent call.  If 'end' is
     # true, force handling all data as if followed by EOF marker.
@@ -124,96 +221,15 @@ def goahead(self, end):
         i = 0
         n = len(rawdata)
         while i < n:
-            if self.nomoretags:
-                self.handle_data(rawdata[i:n])
-                i = n
-                break
-            match = interesting.search(rawdata, i)
-            if match: j = match.start()
-            else: j = n
-            if i < j:
-                self.handle_data(rawdata[i:j])
-            i = j
-            if i == n: break
-            if rawdata[i] == '<':
-                if starttagopen.match(rawdata, i):
-                    if self.literal:
-                        self.handle_data(rawdata[i])
-                        i = i+1
-                        continue
-                    k = self.parse_starttag(i)
-                    if k < 0: break
-                    i = k
-                    continue
-                if rawdata.startswith("</", i):
-                    k = self.parse_endtag(i)
-                    if k < 0: break
-                    i = k
-                    self.literal = 0
-                    continue
-                if self.literal:
-                    if n > (i + 1):
-                        self.handle_data("<")
-                        i = i+1
-                    else:
-                        # incomplete
-                        break
-                    continue
-                if rawdata.startswith("<!--", i):
-                        # Strictly speaking, a comment is --.*--
-                        # within a declaration tag <!...>.
-                        # This should be removed,
-                        # and comments handled only in parse_declaration.
-                    k = self.parse_comment(i)
-                    if k < 0: break
-                    i = k
-                    continue
-                if rawdata.startswith("<?", i):
-                    k = self.parse_pi(i)
-                    if k < 0: break
-                    i = i+k
-                    continue
-                if rawdata.startswith("<!", i):
-                    # This is some sort of declaration; in "HTML as
-                    # deployed," this should only be the document type
-                    # declaration ("<!DOCTYPE html...>").
-                    k = self.parse_declaration(i)
-                    if k < 0: break
-                    i = k
-                    continue
-            elif rawdata[i] == '&':
-                if self.literal:
-                    self.handle_data(rawdata[i])
-                    i = i+1
-                    continue
-                match = charref.match(rawdata, i)
-                if match:
-                    name = match.group(1)
-                    self.handle_charref(name)
-                    i = match.end(0)
-                    if rawdata[i-1] != ';': i = i-1
-                    continue
-                match = entityref.match(rawdata, i)
-                if match:
-                    name = match.group(1)
-                    self.handle_entityref(name)
-                    i = match.end(0)
-                    if rawdata[i-1] != ';': i = i-1
-                    continue
+            if self.error_handler is None:
+                nexti = self._goahead(i, rawdata)
             else:
-                self.error('neither < nor & ??')
-            # We get here only if incomplete matches but
-            # nothing else
-            match = incomplete.match(rawdata, i)
-            if not match:
-                self.handle_data(rawdata[i])
-                i = i+1
-                continue
-            j = match.end(0)
-            if j == n:
-                break # Really incomplete
-            self.handle_data(rawdata[i:j])
-            i = j
+                try:
+                    nexti = self._goahead(i, rawdata)
+                except Exception, e:
+                    nexti = self.error_handler(e, i)
+            if nexti is None: break
+            i = nexti            
         # end while
         if end and i < n:
             self.handle_data(rawdata[i:n])