1"""Beautiful Soup
2Elixir and Tonic
3"The Screen-Scraper's Friend"
4v3.0.0
5http://www.crummy.com/software/BeautifulSoup/
6
7Beautiful Soup parses a (possibly invalid) XML or HTML document into a
8tree representation. It provides methods and Pythonic idioms that make
9it easy to navigate, search, and modify the tree.
10
11A well-formed XML/HTML document yields a well-formed data
12structure. An ill-formed XML/HTML document yields a correspondingly
13ill-formed data structure. If your document is only locally
14well-formed, you can use this library to find and process the
15well-formed part of it.
16
17Beautiful Soup works with Python 2.2 and up. It has no external
18dependencies, but you'll have more success at converting data to UTF-8
19if you also install these three packages:
20
21* chardet, for auto-detecting character encodings
22  http://chardet.feedparser.org/
23* cjkcodecs and iconv_codec, which add more encodings to the ones supported
24  by stock Python.
25  http://cjkpython.i18n.org/
26
27Beautiful Soup defines classes for two main parsing strategies:
28
29 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
30   language that kind of looks like XML.
31
32 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
33   or invalid. This class has web browser-like heuristics for
34   obtaining a sensible parse tree in the face of common HTML errors.
35
36Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
37the encoding of an HTML or XML document, and converting it to
38Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39
40For more than you ever wanted to know about Beautiful Soup, see the
41documentation:
42http://www.crummy.com/software/BeautifulSoup/documentation.html
43
44"""
45from __future__ import generators
46
# Module metadata, exposed as conventional dunder attributes.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "3.0.0"
# Version-control keyword string, stored verbatim.
__date__ = "$Date: 2004/10/18 00:14:20 $"
__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson"
__license__ = "PSF"
52
53from sgmllib import SGMLParser, SGMLParseError
54import codecs
55import types
56import re
57import sgmllib
58from htmlentitydefs import name2codepoint
59
# Replace sgmllib's tag-name pattern with one that also accepts ':'
# inside a name. This makes Beautiful Soup able to parse XML with
# namespaces (e.g. <ns:tag>).
# NOTE: this rebinds an attribute of the sgmllib module itself, so the
# new pattern is seen by any other code in the process that uses
# sgmllib.tagfind.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')

# Default value of the 'encoding' argument of the __str__, __repr__,
# prettify and renderContents methods defined below.
DEFAULT_OUTPUT_ENCODING = "utf-8"
64
65# First, the classes that represent markup elements.
66
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Each element keeps two sets of links:

     * parent, previousSibling and nextSibling describe its place in
       the tree.
     * previous and next chain the elements together in document
       order: a tag is followed by its first child, and the last
       element inside a tag is followed by whatever comes after that
       tag.

    PageElement never creates a 'contents' attribute itself. The
    methods that read self.contents or parent.contents expect the
    object in question to provide it."""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        If the parent already has children, this element is linked as
        the next sibling of the parent's current last child. The
        element is NOT added to parent.contents, and previous.next is
        not touched; both are left to the caller."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Replaces this element with the given element (or string) in
        the contents of this element's parent.

        This element is extracted from the tree. If 'replaceWith' is
        already somewhere in the tree it is moved, not copied. The
        element must currently have a parent."""
        oldParent = self.parent
        myIndex = self._identityIndex(oldParent.contents, self)
        self.extract()
        # If replaceWith is an earlier sibling, insert() compensates
        # for the index shift caused by extracting it, so no
        # adjustment is made here (doing both would shift twice).
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree.

        The element (together with everything beneath it) is removed
        from its parent's contents, and the sibling and document-order
        links of its former neighbors are joined up around it."""
        if self.parent:
            try:
                # Locate the element by identity: list.remove() uses
                # ==, which would remove the first sibling that merely
                # compares equal (e.g. an identical piece of text).
                siblings = self.parent.contents
                del siblings[self._identityIndex(siblings, self)]
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        # Compare against None rather than testing truthiness: an
        # empty piece of text is a valid element but is falsy.
        if self.previous is not None:
            self.previous.next = nextElement
        if nextElement is not None:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling is not None:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling is not None:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _identityIndex(self, elements, target):
        """Returns the index of 'target' in the list 'elements',
        comparing by identity instead of equality. Raises ValueError
        if 'target' is not in the list."""
        for i in range(len(elements)):
            if elements[i] is target:
                return i
        raise ValueError("element is not in the list")

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts 'newChild' into this object's contents so that it
        comes before the element currently at index 'position'.

        A plain string is wrapped in a NavigableString first. A
        position past the end appends. If newChild already has a
        parent it is extracted from there first (so inserting one of
        this object's own children moves it). All sibling and
        document-order links are updated."""
        if isinstance(newChild, basestring) \
           and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                try:
                    index = self._identityIndex(self.contents, newChild)
                except ValueError:
                    # Claims us as parent but is not in our contents;
                    # nothing will shift when it is extracted.
                    index = None
                if index is not None and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if not position:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous is not None:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The new child becomes the last child, so the element
            # that follows it in document order is the next sibling of
            # the closest ancestor that has one (or None if this is
            # the last element in the document).
            ancestor = self
            parentsNextSibling = None
            while parentsNextSibling is None and ancestor is not None:
                parentsNextSibling = ancestor.nextSibling
                ancestor = ancestor.parent
            newChildsLastElement.next = parentsNextSibling
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling is not None:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next is not None:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria, or None if there is no such parent."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""
        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls the given find-all style method with a limit of 1 and
        returns its single result, or None if nothing matched."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        'name' may be a ready-made SoupStrainer; otherwise one is
        built from name, attrs, text and the keyword arguments.
        'generator' is called with no arguments to obtain the
        candidate elements. Returns a ResultSet holding at most
        'limit' matches (all of them if limit is false)."""
        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The generators finish by yielding None; skip it.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags. Each one follows a single link from
    #this element until it runs out, and the last value yielded is
    #always None. The loops test 'is not None' rather than truthiness
    #so that a falsy element (e.g. an empty piece of text) does not
    #cut the traversal short.
    def nextGenerator(self):
        "Yields the elements after this one in document order."
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        "Yields the siblings that come after this element."
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        "Yields the elements before this one in document order."
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        "Yields the siblings that come before this element."
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        "Yields the ancestors of this element, closest first."
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Returns 'str' with every occurrence of the placeholder
        %SOUP-ENCODING% replaced by the encoding name ("utf-8" if no
        encoding is given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With an encoding, strings are passed through .encode(encoding)
        and any other object is converted with str() first. Without
        one, a Unicode string is returned unchanged and everything
        else is passed to unicode()."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s  = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
345
class NavigableString(unicode, PageElement):
    """A piece of text in the parse tree: a Unicode string that also
    carries the navigation links and search methods of PageElement."""

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        Any other missing attribute raises AttributeError."""
        if attr == 'string':
            return self
        # Call form of raise: valid on every Python version, unlike
        # the "raise Class, message" statement.
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, attr))

    def __unicode__(self):
        """Returns the text as Unicode (i.e. without encoding it)."""
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text encoded in the given encoding, or this
        (Unicode) object itself if encoding is None or empty."""
        if encoding:
            return self.encode(encoding)
        else:
            return self
365
class CData(NavigableString):
    """Text that is rendered inside a <![CDATA[...]]> wrapper."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, rendered by NavigableString.__str__ in
        the given encoding, surrounded by the CDATA markers."""
        inner = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % inner
370
class ProcessingInstruction(NavigableString):
    """Text that is rendered between <? and ?>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the instruction wrapped in <?...?>. If the text
        contains the %SOUP-ENCODING% placeholder, the placeholder is
        first replaced via substituteEncoding."""
        if "%SOUP-ENCODING%" in self:
            body = self.substituteEncoding(self, encoding)
        else:
            body = self
        return "<?%s?>" % self.toEncoding(body, encoding)
377
class Comment(NavigableString):
    """Text that is rendered between <!-- and -->."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, rendered by NavigableString.__str__ in
        the given encoding, surrounded by the comment markers."""
        inner = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % inner
381
class Declaration(NavigableString):
    """Text that is rendered between <! and >."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, rendered by NavigableString.__str__ in
        the given encoding, surrounded by <! and >."""
        inner = NavigableString.__str__(self, encoding)
        return "<!%s>" % inner
385
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents.

    Instance attributes set by the constructor:
      name -- the tag name
      attrs -- list of (key, value) attribute pairs
      contents -- list of the elements directly inside this tag
      isSelfClosing -- whatever parser.isSelfClosingTag(name) returned;
        when true the tag is rendered as <name /> with no closing tag
      hidden -- when true, rendering the tag outputs only its contents
      containsSubstitutions -- when true, %SOUP-ENCODING% inside
        attribute values is replaced with the output encoding on
        rendering
    """

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        "Basic constructor."

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        "Returns true if the tag has a 'key' attribute."
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "'x in tag' tests whether x is one of the tag's direct contents."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag. Every existing occurrence of the attribute is replaced;
        if there is none, the attribute is appended."""
        attrMap = self._getAttrMap()
        attrMap[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Bad HTML can define the same attribute multiple times, so
        # every occurrence must go. The list is rebuilt (in place, so
        # other references to it stay valid) rather than removing
        # items while iterating over it, which skips the item after
        # each removal.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """Looking up an attribute the tag does not have searches for
        a child tag instead: tag.fooTag and tag.foo both return the
        first 'foo' tag beneath this one (or None).

        Names that start with a double underscore are never treated
        as tag names; they raise AttributeError so that Python's own
        lookups of optional special methods behave normally."""
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') \
               or not hasattr(other, 'attrs') \
               or not hasattr(other, 'contents') \
               or self.name != other.name \
               or self.attrs != other.attrs \
               or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        "Renders this tag as a Unicode string."
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        If prettyPrint is true, tags and text are put on separate
        lines and indented according to indentLevel.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)
                    if '"' in val:
                        # The value contains a double quote, so
                        # delimit it with single quotes instead.
                        fmt = "%s='%s'"
                        # This can't happen naturally, but it can happen
                        # if you modify an attribute value and print it out.
                        if "'" in val:
                            # Both kinds of quote are present: escape
                            # the single quotes with a numeric
                            # character reference. ("&squot;", used
                            # previously, is not a defined entity.)
                            val = val.replace("'", "&#39;")
                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            # 'space' is only defined (and only used) when pretty-printing.
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Renders this tag with pretty-printing turned on."
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string.

        When pretty-printing, text is stripped of surrounding
        whitespace, indented, and followed by a newline; text that is
        empty after stripping is left out."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
              **kwargs):
        """Return only the first child of this
        Tag matching the given criteria, or None if nothing matches."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        If 'recursive' is false only the direct children of this tag
        are considered. Keyword arguments are treated as additional
        attribute criteria."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findAllChildren = findAll

    #Utility methods

    def append(self, tag):
        """Appends the given tag to the contents of this tag.

        This only adds to the contents list; none of the parent,
        sibling or previous/next links are set or changed."""
        self.contents.append(tag)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # Look in the instance dictionary directly: getattr() on a
        # missing 'attrMap' would fall through to __getattr__ and be
        # answered with a search for a child tag named 'attrMap'.
        if not self.__dict__.get('attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def childGenerator(self):
        "Yields the direct children of this tag, in order."
        # The generator simply ends when the loop does; an explicit
        # 'raise StopIteration' is unnecessary and becomes a
        # RuntimeError on newer Pythons (PEP 479).
        for i in range(0, len(self.contents)):
            yield self.contents[i]

    def recursiveChildGenerator(self):
        """Yields everything beneath this tag in document order: each
        child is followed by its own descendants."""
        # Each stack entry is (tag, index of the next child to visit).
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        # Descend into the child tag now; remember
                        # where to resume in this tag afterwards.
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
652
653# Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    A strainer holds three criteria: 'name' (for the tag name), 'attrs'
    (a map of attribute name to criterion) and 'text' (for strings).
    Each criterion is interpreted by _matches."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Stores the criteria. Keyword arguments are merged into a
        copy of 'attrs'; the map passed in is not modified."""
        self.name=name
        self.attrs=attrs.copy()
        self.attrs.update(kwargs)
        self.text = text

    def __str__(self):
        """Returns the text criterion if there is one, otherwise
        "name|attrs"."""
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Checks a tag against the name and attribute criteria.

        The tag is given either as a Tag object (passed as
        markupName) or as a name plus its attributes, the latter as
        a map or as a sequence of (key, value) pairs. Returns the Tag
        (or the name, if no Tag was given) on a match, else None.

        If the name criterion is callable and no Tag object was
        given, the callable is called with (markupName, markupAttrs)
        and its result alone decides the match."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                if self.attrs:
                    # Get something with a get() method to look the
                    # attribute values up in: the Tag or map itself,
                    # or a map built from the (key, value) pairs.
                    if hasattr(markupAttrs, 'get'):
                        markupAttrMap = markupAttrs
                    else:
                        markupAttrMap = {}
                        for k, v in markupAttrs:
                            markupAttrMap[k] = v
                    for attr, matchAgainst in self.attrs.items():
                        attrValue = markupAttrMap.get(attr)
                        if not self._matches(attrValue, matchAgainst):
                            match = False
                            break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Checks a Tag, a string, or a list of elements against this
        strainer and returns the matching element, or None.

        Raises an Exception for any other kind of object."""
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception("I don't know how to match against a %s"
                            % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        """Tests one piece of markup (a Tag, a string, an attribute
        value, or None) against one criterion.

        The criterion may be:
          True -- matches anything that is not None
          a callable -- called with the markup; its result is returned
          an object with a 'match' attribute (a regexp) -- searched for
            in the markup
          a list -- the markup must be one of its items
          anything else -- compared to the markup for equality
        """
        result = False
        if matchAgainst is True:
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # A map-like object that isList() did not recognize.
                # This branch used to call markup.has_key(matchAgainst),
                # which cannot work because markup is a string or None
                # here; presumably a key lookup was intended, so test
                # whether the markup is one of the map's keys.
                result = markup in [k for k, v in matchAgainst.items()]
            elif matchAgainst and isString(markup):
                # Convert the criterion to the same kind of string as
                # the markup before the equality test below.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
764
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        """Creates an empty result list; 'source' is stored as the
        'source' attribute."""
        # Initialize this object (previously a throwaway list was
        # initialized instead).
        list.__init__(self)
        self.source = source
771
772# Now, some helper functions.
773
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike.

    An object counts as listlike if it has an __iter__ attribute, or
    if its type is exactly list or tuple."""
    if hasattr(l, '__iter__'):
        return True
    return type(l) in (types.ListType, types.TupleType)
779
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        return isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        # No 'basestring' (or 'unicode') builtin on this version of
        # Python; fall back to the plain string type.
        return isinstance(s, str)
787
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps.

    Maps are merged in as they are; every item of a list, and every
    scalar, becomes a key mapped to 'default'. Later arguments
    override earlier ones."""
    tagMap = {}
    for piece in args:
        if hasattr(piece, 'items'):
            #It's a map. Merge it.
            for key, value in piece.items():
                tagMap[key] = value
            continue
        if isList(piece):
            #It's a list. Each item becomes a key.
            keys = piece
        else:
            #It's a scalar. It is the only key.
            keys = [piece]
        for key in keys:
            tagMap[key] = default
    return tagMap
806
807# Now, the parser classes.
808
809class BeautifulStoneSoup(Tag, SGMLParser):
810
811    """This class contains the basic parser and search code. It defines
812    a parser that knows nothing about tag behavior except for the
813    following:
814
815      You can't close a tag without closing all the tags it encloses.
816      That is, "<foo><bar></foo>" actually means
817      "<foo><bar></bar></foo>".
818
819    [Another possible explanation is "<foo><bar /></foo>", but since
820    this class defines no SELF_CLOSING_TAGS, it will never use that
821    explanation.]
822
823    This class is useful for parsing XML or made-up markup languages,
824    or when BeautifulSoup makes an assumption counter to what you were
825    expecting."""
826
    # The entity names that handle_entityref will convert when
    # convertEntities is XML_ENTITIES.
    XML_ENTITY_LIST = {}
    for i in ["quot", "apos", "amp", "lt", "gt"]:
        XML_ENTITY_LIST[i] = True

    # Tag-behavior maps. They are all empty here, so this class makes
    # no assumptions about particular tags; subclasses override them.
    # SELF_CLOSING_TAGS is consulted by isSelfClosingTag, NESTABLE_TAGS
    # and RESET_NESTING_TAGS by _smartPop, and QUOTE_TAGS by
    # unknown_starttag.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}

    # Default (regex, replacement function) pairs applied to the markup
    # by _feed before parsing. The first puts a space before the '/>'
    # of a tag; the second removes whitespace that follows '<!'.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    # Name given to the soup object itself when it acts as the root tag.
    ROOT_TAG_NAME = u'[document]'

    # Values accepted for the smartQuotesTo and convertEntities
    # arguments of __init__.
    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want.

        Arguments:

         markup -- a string, or any object with a read() method, whose
           whole contents are read before parsing starts.
         parseOnlyThese -- optional filter (presumably a SoupStrainer;
           confirm against its definition) consulted by endData and
           unknown_starttag for top-level strings and tags; whatever
           it rejects is left out of the tree.
         fromEncoding -- the first encoding offered to UnicodeDammit
           when the markup is not already Unicode.
         markupMassage -- a true non-list value selects MARKUP_MASSAGE,
           a list is used as the custom set of fixes, and a false value
           turns massaging off.
         smartQuotesTo -- handed to UnicodeDammit; forced to None
           whenever convertEntities is set.
         convertEntities -- HTML_ENTITIES or XML_ENTITIES to turn
           entity and character references into Unicode characters
           (see handle_entityref and handle_charref); None leaves them
           as text.
         selfClosingTags -- extra tag name(s) that this instance
           treats as self-closing, in any form buildTagMap accepts."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed()
        except StopParsing:
            # Raised by BeautifulSoup.start_meta once it has re-fed the
            # document itself; there is nothing left to do here.
            pass
        self.markup = None                 # The markup can now be GCed
896
897    def _feed(self, inDocumentEncoding=None):
898        # Convert the document to Unicode.
899        markup = self.markup
900        if isinstance(markup, unicode):
901            if not hasattr(self, 'originalEncoding'):
902                self.originalEncoding = None
903        else:
904            dammit = UnicodeDammit\
905                     (markup, [self.fromEncoding, inDocumentEncoding],
906                      smartQuotesTo=self.smartQuotesTo)
907            markup = dammit.unicode
908            self.originalEncoding = dammit.originalEncoding
909        if markup:
910            if self.markupMassage:
911                if not isList(self.markupMassage):
912                    self.markupMassage = self.MARKUP_MASSAGE
913                for fix, m in self.markupMassage:
914                    markup = fix.sub(m, markup)
915        self.reset()
916
917        SGMLParser.feed(self, markup)
918        # Close out any unfinished strings and close all the open tags.
919        self.endData()
920        while self.currentTag.name != self.ROOT_TAG_NAME:
921            self.popTag()
922
923    def __getattr__(self, methodName):
924        """This method routes method call requests to either the SGMLParser
925        superclass or the Tag superclass, depending on the method name."""
926        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
927
928        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
929               or methodName.find('do_') == 0:
930            return SGMLParser.__getattr__(self, methodName)
931        elif methodName.find('__'):
932            return Tag.__getattr__(self, methodName)
933        else:
934            raise AttributeError
935
936    def isSelfClosingTag(self, name):
937        """Returns true iff the given string is the name of a
938        self-closing tag according to this parser."""
939        return self.SELF_CLOSING_TAGS.has_key(name) \
940               or self.instanceSelfClosingTags.has_key(name)
941
    def reset(self):
        """Returns the soup to its initial, empty parsing state: the
        object is re-initialized as the root tag, the SGMLParser state
        is reset, and the text buffer and both stacks are cleared."""
        # Re-run the Tag initializer with this object as both the
        # parser and the tag, under the root tag's name.
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        # NOTE(review): presumably keeps the synthetic root tag out of
        # rendered output -- confirm against Tag.
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []     # Text chunks not yet turned into a node
        self.currentTag = None
        self.tagStack = []        # Open tags, outermost first
        self.quoteStack = []      # Open QUOTE_TAGS elements
        # The root goes on the stack last: pushTag needs the empty
        # stack and the cleared currentTag set up above.
        self.pushTag(self)
951
952    def popTag(self):
953        tag = self.tagStack.pop()
954        # Tags with just one string-owning child get the child as a
955        # 'string' property, so that soup.tag.string is shorthand for
956        # soup.tag.contents[0]
957        if len(self.currentTag.contents) == 1 and \
958           isinstance(self.currentTag.contents[0], NavigableString):
959            self.currentTag.string = self.currentTag.contents[0]
960
961        #print "Pop", tag.name
962        if self.tagStack:
963            self.currentTag = self.tagStack[-1]
964        return self.currentTag
965
966    def pushTag(self, tag):
967        #print "Push", tag.name
968        if self.currentTag:
969            self.currentTag.append(tag)
970        self.tagStack.append(tag)
971        self.currentTag = self.tagStack[-1]
972
973    def endData(self, containerClass=NavigableString):
974        if self.currentData:
975            currentData = ''.join(self.currentData)
976            if not currentData.strip():
977                if '\n' in currentData:
978                    currentData = '\n'
979                else:
980                    currentData = ' '
981            self.currentData = []
982            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
983                   (not self.parseOnlyThese.text or
984                    not self.parseOnlyThese.search(currentData)):
985                return
986            o = containerClass(currentData)
987            o.setup(self.currentTag, self.previous)
988            if self.previous:
989                self.previous.next = o
990            self.previous = o
991            self.currentTag.contents.append(o)
992
993
994    def _popToTag(self, name, inclusivePop=True):
995        """Pops the tag stack up to and including the most recent
996        instance of the given tag. If inclusivePop is false, pops the tag
997        stack up to but *not* including the most recent instqance of
998        the given tag."""
999        #print "Popping to %s" % name
1000        if name == self.ROOT_TAG_NAME:
1001            return
1002
1003        numPops = 0
1004        mostRecentTag = None
1005        for i in range(len(self.tagStack)-1, 0, -1):
1006            if name == self.tagStack[i].name:
1007                numPops = len(self.tagStack)-i
1008                break
1009        if not inclusivePop:
1010            numPops = numPops - 1
1011
1012        for i in range(0, numPops):
1013            mostRecentTag = self.popTag()
1014        return mostRecentTag
1015
1016    def _smartPop(self, name):
1017
1018        """We need to pop up to the previous tag of this type, unless
1019        one of this tag's nesting reset triggers comes between this
1020        tag and the previous tag of this type, OR unless this tag is a
1021        generic nesting trigger and another generic nesting trigger
1022        comes between this tag and the previous tag of this type.
1023
1024        Examples:
1025         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1026         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
1027         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
1028         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1029
1030         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1031         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1032         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1033        """
1034
1035        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1036        isNestable = nestingResetTriggers != None
1037        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1038        popTo = None
1039        inclusive = True
1040        for i in range(len(self.tagStack)-1, 0, -1):
1041            p = self.tagStack[i]
1042            if (not p or p.name == name) and not isNestable:
1043                #Non-nestable tags get popped to the top or to their
1044                #last occurance.
1045                popTo = name
1046                break
1047            if (nestingResetTriggers != None
1048                and p.name in nestingResetTriggers) \
1049                or (nestingResetTriggers == None and isResetNesting
1050                    and self.RESET_NESTING_TAGS.has_key(p.name)):
1051
1052                #If we encounter one of the nesting reset triggers
1053                #peculiar to this tag, or we encounter another tag
1054                #that causes nesting to reset, pop up to but not
1055                #including that tag.
1056                popTo = p.name
1057                inclusive = False
1058                break
1059            p = p.parent
1060        if popTo:
1061            self._popToTag(popTo, inclusive)
1062
1063    def unknown_starttag(self, name, attrs, selfClosing=0):
1064        #print "Start tag %s" % name
1065        if self.quoteStack:
1066            #This is not a real tag.
1067            #print "<%s> is not real!" % name
1068            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1069            self.handle_data('<%s%s>' % (name, attrs))
1070            return
1071        self.endData()
1072
1073        if not self.isSelfClosingTag(name) and not selfClosing:
1074            self._smartPop(name)
1075
1076        if self.parseOnlyThese and len(self.tagStack) <= 1 \
1077               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1078            return
1079
1080        tag = Tag(self, name, attrs, self.currentTag, self.previous)
1081        if self.previous:
1082            self.previous.next = tag
1083        self.previous = tag
1084        self.pushTag(tag)
1085        if selfClosing or self.isSelfClosingTag(name):
1086            self.popTag()
1087        if name in self.QUOTE_TAGS:
1088            #print "Beginning quote (%s)" % name
1089            self.quoteStack.append(name)
1090            self.literal = 1
1091        return tag
1092
1093    def unknown_endtag(self, name):
1094        #print "End tag %s" % name
1095        if self.quoteStack and self.quoteStack[-1] != name:
1096            #This is not a real end tag.
1097            #print "</%s> is not real!" % name
1098            self.handle_data('</%s>' % name)
1099            return
1100        self.endData()
1101        self._popToTag(name)
1102        if self.quoteStack and self.quoteStack[-1] == name:
1103            self.quoteStack.pop()
1104            self.literal = (len(self.quoteStack) > 0)
1105
    def handle_data(self, data):
        """Queues a chunk of text. The queued chunks are joined and
        turned into a single node by the next endData() call."""
        self.currentData.append(data)
1108
    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        # Flush pending ordinary text first, so that it is not folded
        # into the special node.
        self.endData()
        self.handle_data(text)
        # Now the buffer holds only `text`; emit it as `subclass`.
        self.endData(subclass)
1115
1116    def handle_pi(self, text):
1117        """Handle a processing instruction as a ProcessingInstruction
1118        object, possibly one with a %SOUP-ENCODING% slot into which an
1119        encoding will be plugged later."""
1120        if text[:3] == "xml":
1121            text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
1122        self._toStringSubclass(text, ProcessingInstruction)
1123
    def handle_comment(self, text):
        """Handle comments as Comment objects: the comment text is
        added to the tree through _toStringSubclass."""
        self._toStringSubclass(text, Comment)
1127
1128    def handle_charref(self, ref):
1129        "Handle character references as data."
1130        if self.convertEntities in [self.HTML_ENTITIES,
1131                                    self.XML_ENTITIES]:
1132            data = unichr(int(ref))
1133        else:
1134            data = '&#%s;' % ref
1135        self.handle_data(data)
1136
1137    def handle_entityref(self, ref):
1138        """Handle entity references as data, possibly converting known
1139        HTML entity references to the corresponding Unicode
1140        characters."""
1141        data = None
1142        if self.convertEntities == self.HTML_ENTITIES or \
1143               (self.convertEntities == self.XML_ENTITIES and
1144                self.XML_ENTITY_LIST.get(ref)):
1145            try:
1146                data = unichr(name2codepoint[ref])
1147            except KeyError:
1148                pass
1149        if not data:
1150            data = '&%s;' % ref
1151        self.handle_data(data)
1152
    def handle_decl(self, data):
        """Handle DOCTYPEs and the like as Declaration objects: the
        declaration text is added to the tree through
        _toStringSubclass."""
        self._toStringSubclass(data, Declaration)
1156
1157    def parse_declaration(self, i):
1158        """Treat a bogus SGML declaration as raw data. Treat a CDATA
1159        declaration as a CData object."""
1160        j = None
1161        if self.rawdata[i:i+9] == '<![CDATA[':
1162             k = self.rawdata.find(']]>', i)
1163             if k == -1:
1164                 k = len(self.rawdata)
1165             data = self.rawdata[i+9:k]
1166             j = k+3
1167             self._toStringSubclass(data, CData)
1168        else:
1169            try:
1170                j = SGMLParser.parse_declaration(self, i)
1171            except SGMLParseError:
1172                toHandle = self.rawdata[i:]
1173                self.handle_data(toHandle)
1174                j = i + len(toHandle)
1175        return j
1176
1177class BeautifulSoup(BeautifulStoneSoup):
1178
1179    """This parser knows the following facts about HTML:
1180
1181    * Some tags have no closing tag and should be interpreted as being
1182      closed as soon as they are encountered.
1183
1184    * The text inside some tags (ie. 'script') may contain tags which
1185      are not really part of the document and which should be parsed
1186      as text, not tags. If you want to parse the text as tags, you can
1187      always fetch it and parse it explicitly.
1188
1189    * Tag nesting rules:
1190
1191      Most tags can't be nested at all. For instance, the occurance of
1192      a <p> tag should implicitly close the previous <p> tag.
1193
1194       <p>Para1<p>Para2
1195        should be transformed into:
1196       <p>Para1</p><p>Para2
1197
1198      Some tags can be nested arbitrarily. For instance, the occurance
1199      of a <blockquote> tag should _not_ implicitly close the previous
1200      <blockquote> tag.
1201
1202       Alice said: <blockquote>Bob said: <blockquote>Blah
1203        should NOT be transformed into:
1204       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1205
1206      Some tags can be nested, but the nesting is reset by the
1207      interposition of other tags. For instance, a <tr> tag should
1208      implicitly close the previous <tr> tag within the same <table>,
1209      but not close a <tr> tag in another table.
1210
1211       <table><tr>Blah<tr>Blah
1212        should be transformed into:
1213       <table><tr>Blah</tr><tr>Blah
1214        but,
1215       <tr>Blah<table><tr>Blah
1216        should NOT be transformed into
1217       <tr>Blah<table></tr><tr>Blah
1218
1219    Differing assumptions about tag nesting rules are a major source
1220    of problems with the BeautifulSoup class. If BeautifulSoup is not
1221    treating as nestable a tag your page author treats as nestable,
1222    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1223    BeautifulStoneSoup before writing your own subclass."""
1224
1225    def __init__(self, *args, **kwargs):
1226        if not kwargs.has_key('smartQuotesTo'):
1227            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
1228        BeautifulStoneSoup.__init__(self, *args, **kwargs)
1229
    # Tags that are closed as soon as they are opened.
    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    # Tags whose contents are kept as text: unknown_starttag turns any
    # markup found inside them back into data.
    QUOTE_TAGS = {'script': None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    #Each value names the tags that reset nesting for its key; see
    #_smartPop.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    #The values have the same meaning as in NESTABLE_LIST_TAGS.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    # Every nestable tag, mapped to its nesting reset triggers. The
    # inline and block tags all share the single empty list passed as
    # the default, so that list must never be modified in place.
    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    # Group 1 is everything up to and including 'charset=', group 3 is
    # the charset value itself.
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
1279
1280    def start_meta(self, attrs):
1281        """Beautiful Soup can detect a charset included in a META tag,
1282        try to convert the document to that charset, and re-parse the
1283        document from the beginning."""
1284        httpEquiv = None
1285        contentType = None
1286        contentTypeIndex = None
1287        tagNeedsEncodingSubstitution = False
1288
1289        for i in range(0, len(attrs)):
1290            key, value = attrs[i]
1291            key = key.lower()
1292            if key == 'http-equiv':
1293                httpEquiv = value
1294            elif key == 'content':
1295                contentType = value
1296                contentTypeIndex = i
1297
1298        if httpEquiv and contentType: # It's an interesting meta tag.
1299            match = self.CHARSET_RE.search(contentType)
1300            if match:
1301                if getattr(self, 'declaredHTMLEncoding') or \
1302                       (self.originalEncoding == self.fromEncoding):
1303                    # This is our second pass through the document, or
1304                    # else an encoding was specified explicitly and it
1305                    # worked. Rewrite the meta tag.
1306                    newAttr = self.CHARSET_RE.sub\
1307                              (lambda(match):match.group(1) +
1308                               "%SOUP-ENCODING%", value)
1309                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1310                                               newAttr)
1311                    tagNeedsEncodingSubstitution = True
1312                else:
1313                    # This is our first pass through the document.
1314                    # Go through it again with the new information.
1315                    newCharset = match.group(3)
1316                    if newCharset and newCharset != self.originalEncoding:
1317                        self.declaredHTMLEncoding = newCharset
1318                        self._feed(self.declaredHTMLEncoding)
1319                        raise StopParsing
1320        tag = self.unknown_starttag("meta", attrs)
1321        if tagNeedsEncodingSubstitution:
1322            tag.containsSubstitutions = True
1323
class StopParsing(Exception):
    """Signals that the current parsing pass should be abandoned.
    BeautifulSoup.start_meta raises it after re-feeding the document
    with a newly discovered charset, and BeautifulStoneSoup.__init__
    catches it."""
    pass
1326
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags that this class additionally treats as nestable.
    # 'strong' and 'big' are listed twice; that is harmless, because
    # buildTagMap turns the list into map keys.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    # Block tags that this class additionally treats as nestable.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # BeautifulSoup's nestable tags plus the two lists above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1362
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): 'noscript' is bound to buildTagMap's `default`
    # parameter here and no further arguments follow, so this
    # evaluates to an empty map. That agrees with "no assumptions
    # about tag nesting"; if 'noscript' was meant to stay a nesting
    # reset tag, the call would have to be
    # buildTagMap(None, 'noscript') -- confirm the intent.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    # No tag is treated as nestable.
    NESTABLE_TAGS = {}
1375
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Before popping as usual, copies the closing tag's only
        string child onto its parent as an attribute named after the
        tag, unless the parent already has that attribute.

        Returns what BeautifulStoneSoup.popTag returns."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            # Make sure parent.attrMap exists before it is consulted.
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        # Pass the base class's result through: _popToTag hands the
        # value of popTag() back to its own caller, and the override
        # used to drop it.
        return BeautifulStoneSoup.popTag(self)
1406
1407#Enterprise class names! It has come to our attention that some people
1408#think the names of the Beautiful Soup parser classes are too silly
1409#and "unprofessional" for use in enterprise screen-scraping. We feel
1410#your pain! For such-minded folk, the Beautiful Soup Consortium And
1411#All-Night Kosher Bakery recommends renaming this file to
1412#"RobustParser.py" (or, in cases of extreme enterprisitude,
1413#"RobustParserBeanInterface.class") and using the following
1414#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup; adds nothing."""
    pass
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup; adds nothing."""
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup; adds
    nothing."""
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup; adds nothing."""
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP; adds nothing."""
    pass
1425
1426######################################################
1427#
1428# Bonus library: Unicode, Dammit
1429#
1430# This class forces XML data into a standard format (usually to UTF-8
1431# or Unicode).  It is heavily based on code from Mark Pilgrim's
1432# Universal Feed Parser. It does not rewrite the XML or HTML to
1433# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1434# (XML) and BeautifulSoup.start_meta (HTML).
1435
1436# Autodetects character encodings.
1437# Download from http://chardet.feedparser.org/
try:
    import chardet
except ImportError:
    # chardet is optional; without it UnicodeDammit simply skips the
    # auto-detection step.
    chardet = None
# The unconditional "chardet = None" that used to follow this block
# switched auto-detection off even when chardet was installed.
1445
1446# cjkcodecs and iconv_codec make Python know about more character encodings.
1447# Both are available from http://cjkpython.i18n.org/
1448# They're built in if you use Python 2.4.
# Both modules are imported for their side effects only and nothing
# from them is referenced afterwards. Any failure is deliberately
# ignored, so that this module still loads without them.
try:
    import cjkcodecs.aliases
except:
    pass
try:
    import iconv_codec
except:
    pass
1457
1458class UnicodeDammit:
1459    """A class for detecting the encoding of a *ML document and
1460    converting it to a Unicode string. If the source encoding is
1461    windows-1252, can replace MS smart quotes with their HTML or XML
1462    equivalents."""
1463
1464    # This dictionary maps commonly seen values for "charset" in HTML
1465    # meta tags to the corresponding Python codec names. It only covers
1466    # values that aren't in Python's aliases and can't be determined
1467    # by the heuristics in find_codec.
1468    CHARSET_ALIASES = { "macintosh" : "mac-roman",
1469                        "x-sjis" : "shift-jis" }
1470
1471    def __init__(self, markup, overrideEncodings=[],
1472                 smartQuotesTo='xml'):
1473        self.markup, documentEncoding, sniffedEncoding = \
1474                     self._detectEncoding(markup)
1475        self.smartQuotesTo = smartQuotesTo
1476        self.triedEncodings = []
1477        if isinstance(markup, unicode):
1478            return markup
1479
1480        u = None
1481        for proposedEncoding in overrideEncodings:
1482            u = self._convertFrom(proposedEncoding)
1483            if u: break
1484        if not u:
1485            for proposedEncoding in (documentEncoding, sniffedEncoding):
1486                u = self._convertFrom(proposedEncoding)
1487                if u: break
1488
1489        # If no luck and we have auto-detection library, try that:
1490        if not u and chardet and not isinstance(self.markup, unicode):
1491            u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1492
1493        # As a last resort, try utf-8 and windows-1252:
1494        if not u:
1495            for proposed_encoding in ("utf-8", "windows-1252"):
1496                u = self._convertFrom(proposed_encoding)
1497                if u: break
1498        self.unicode = u
1499        if not u: self.originalEncoding = None
1500
1501    def _subMSChar(self, orig):
1502        """Changes a MS smart quote character to an XML or HTML
1503        entity."""
1504        sub = self.MS_CHARS.get(orig)
1505        if type(sub) == types.TupleType:
1506            if self.smartQuotesTo == 'xml':
1507                sub = '&#x%s;' % sub[1]
1508            else:
1509                sub = '&%s;' % sub[0]
1510        return sub
1511
1512    def _convertFrom(self, proposed):
1513        proposed = self.find_codec(proposed)
1514        if not proposed or proposed in self.triedEncodings:
1515            return None
1516        self.triedEncodings.append(proposed)
1517        markup = self.markup
1518
1519        # Convert smart quotes to HTML if coming from an encoding
1520        # that might have them.
1521        if self.smartQuotesTo and proposed in("windows-1252",
1522                                              "ISO-8859-1",
1523                                              "ISO-8859-2"):
1524            markup = re.compile("([\x80-\x9f])").sub \
1525                     (lambda(x): self._subMSChar(x.group(1)),
1526                      markup)
1527
1528        try:
1529            # print "Trying to convert document to %s" % proposed
1530            u = self._toUnicode(markup, proposed)
1531            self.markup = u
1532            self.originalEncoding = proposed
1533        except Exception, e:
1534            # print "That didn't work!"
1535            # print e
1536            return None
1537        #print "Correct encoding: %s" % proposed
1538        return self.markup
1539
1540    def _toUnicode(self, data, encoding):
1541        '''Given a string and its encoding, decodes the string into Unicode.
1542        %encoding is a string recognized by encodings.aliases'''
1543
1544        # strip Byte Order Mark (if present)
1545        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1546               and (data[2:4] != '\x00\x00'):
1547            encoding = 'utf-16be'
1548            data = data[2:]
1549        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1550                 and (data[2:4] != '\x00\x00'):
1551            encoding = 'utf-16le'
1552            data = data[2:]
1553        elif data[:3] == '\xef\xbb\xbf':
1554            encoding = 'utf-8'
1555            data = data[3:]
1556        elif data[:4] == '\x00\x00\xfe\xff':
1557            encoding = 'utf-32be'
1558            data = data[4:]
1559        elif data[:4] == '\xff\xfe\x00\x00':
1560            encoding = 'utf-32le'
1561            data = data[4:]
1562        newdata = unicode(data, encoding)
1563        return newdata
1564
1565    def _detectEncoding(self, xml_data):
1566        """Given a document, tries to detect its XML encoding."""
1567        xml_encoding = sniffed_xml_encoding = None
1568        try:
1569            if xml_data[:4] == '\x4c\x6f\xa7\x94':
1570                # EBCDIC
1571                xml_data = self._ebcdic_to_ascii(xml_data)
1572            elif xml_data[:4] == '\x00\x3c\x00\x3f':
1573                # UTF-16BE
1574                sniffed_xml_encoding = 'utf-16be'
1575                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1576            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1577                     and (xml_data[2:4] != '\x00\x00'):
1578                # UTF-16BE with BOM
1579                sniffed_xml_encoding = 'utf-16be'
1580                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1581            elif xml_data[:4] == '\x3c\x00\x3f\x00':
1582                # UTF-16LE
1583                sniffed_xml_encoding = 'utf-16le'
1584                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1585            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1586                     (xml_data[2:4] != '\x00\x00'):
1587                # UTF-16LE with BOM
1588                sniffed_xml_encoding = 'utf-16le'
1589                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1590            elif xml_data[:4] == '\x00\x00\x00\x3c':
1591                # UTF-32BE
1592                sniffed_xml_encoding = 'utf-32be'
1593                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1594            elif xml_data[:4] == '\x3c\x00\x00\x00':
1595                # UTF-32LE
1596                sniffed_xml_encoding = 'utf-32le'
1597                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1598            elif xml_data[:4] == '\x00\x00\xfe\xff':
1599                # UTF-32BE with BOM
1600                sniffed_xml_encoding = 'utf-32be'
1601                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1602            elif xml_data[:4] == '\xff\xfe\x00\x00':
1603                # UTF-32LE with BOM
1604                sniffed_xml_encoding = 'utf-32le'
1605                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1606            elif xml_data[:3] == '\xef\xbb\xbf':
1607                # UTF-8 with BOM
1608                sniffed_xml_encoding = 'utf-8'
1609                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1610            else:
1611                sniffed_xml_encoding = 'ascii'
1612                pass
1613            xml_encoding_match = re.compile \
1614                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
1615                                 .match(xml_data)
1616        except:
1617            xml_encoding_match = None
1618        if xml_encoding_match:
1619            xml_encoding = xml_encoding_match.groups()[0].lower()
1620            if sniffed_xml_encoding and \
1621               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1622                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1623                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
1624                                 'utf16', 'u16')):
1625                xml_encoding = sniffed_xml_encoding
1626        return xml_data, xml_encoding, sniffed_xml_encoding
1627
1628
1629    def find_codec(self, charset):
1630        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1631               or (charset and self._codec(charset.replace("-", ""))) \
1632               or (charset and self._codec(charset.replace("-", "_"))) \
1633               or charset
1634
1635    def _codec(self, charset):
1636        if not charset: return charset
1637        codec = None
1638        try:
1639            codecs.lookup(charset)
1640            codec = charset
1641        except LookupError:
1642            pass
1643        return codec
1644
1645    EBCDIC_TO_ASCII_MAP = None
1646    def _ebcdic_to_ascii(self, s):
1647        c = self.__class__
1648        if not c.EBCDIC_TO_ASCII_MAP:
1649            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1650                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1651                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1652                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1653                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1654                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1655                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1656                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1657                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1658                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1659                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1660                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1661                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1662                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1663                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1664                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1665                    250,251,252,253,254,255)
1666            import string
1667            c.EBCDIC_TO_ASCII_MAP = string.maketrans(
1668                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1669        return s.translate(c.EBCDIC_TO_ASCII_MAP)
1670
1671    MS_CHARS = { '\x80' : ('euro', '20AC'),
1672                 '\x81' : ' ',
1673                 '\x82' : ('sbquo', '201A'),
1674                 '\x83' : ('fnof', '192'),
1675                 '\x84' : ('bdquo', '201E'),
1676                 '\x85' : ('hellip', '2026'),
1677                 '\x86' : ('dagger', '2020'),
1678                 '\x87' : ('Dagger', '2021'),
1679                 '\x88' : ('circ', '2C6'),
1680                 '\x89' : ('permil', '2030'),
1681                 '\x8A' : ('Scaron', '160'),
1682                 '\x8B' : ('lsaquo', '2039'),
1683                 '\x8C' : ('OElig', '152'),
1684                 '\x8D' : '?',
1685                 '\x8E' : ('#x17D', '17D'),
1686                 '\x8F' : '?',
1687                 '\x90' : '?',
1688                 '\x91' : ('lsquo', '2018'),
1689                 '\x92' : ('rsquo', '2019'),
1690                 '\x93' : ('ldquo', '201C'),
1691                 '\x94' : ('rdquo', '201D'),
1692                 '\x95' : ('bull', '2022'),
1693                 '\x96' : ('ndash', '2013'),
1694                 '\x97' : ('mdash', '2014'),
1695                 '\x98' : ('tilde', '2DC'),
1696                 '\x99' : ('trade', '2122'),
1697                 '\x9a' : ('scaron', '161'),
1698                 '\x9b' : ('rsaquo', '203A'),
1699                 '\x9c' : ('oelig', '153'),
1700                 '\x9d' : '?',
1701                 '\x9e' : ('#x17E', '17E'),
1702                 '\x9f' : ('Yuml', ''),}
1703
1704#######################################################################
1705
1706
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    # Read the whole document from standard input and parse it as
    # HTML. BeautifulSoup is the class the module docstring names for
    # HTML; BeautifulStoneSoup, used here before, is the one it names
    # for XML and SGML.
    soup = BeautifulSoup(sys.stdin.read())
    print(soup.prettify())
1712