1"""Beautiful Soup 2Elixir and Tonic 3"The Screen-Scraper's Friend" 4v3.0.0 5http://www.crummy.com/software/BeautifulSoup/ 6 7Beautiful Soup parses a (possibly invalid) XML or HTML document into a 8tree representation. It provides methods and Pythonic idioms that make 9it easy to navigate, search, and modify the tree. 10 11A well-formed XML/HTML document yields a well-formed data 12structure. An ill-formed XML/HTML document yields a correspondingly 13ill-formed data structure. If your document is only locally 14well-formed, you can use this library to find and process the 15well-formed part of it. 16 17Beautiful Soup works with Python 2.2 and up. It has no external 18dependencies, but you'll have more success at converting data to UTF-8 19if you also install these three packages: 20 21* chardet, for auto-detecting character encodings 22 http://chardet.feedparser.org/ 23* cjkcodecs and iconv_codec, which add more encodings to the ones supported 24 by stock Python. 25 http://cjkpython.i18n.org/ 26 27Beautiful Soup defines classes for two main parsing strategies: 28 29 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific 30 language that kind of looks like XML. 31 32 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid 33 or invalid. This class has web browser-like heuristics for 34 obtaining a sensible parse tree in the face of common HTML errors. 35 36Beautiful Soup also defines a class (UnicodeDammit) for autodetecting 37the encoding of an HTML or XML document, and converting it to 38Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. 39 40For more than you ever wanted to know about Beautiful Soup, see the 41documentation: 42http://www.crummy.com/software/BeautifulSoup/documentation.html 43 44""" 45from __future__ import generators 46 47__author__ = "Leonard Richardson (leonardr@segfault.org)" 48__version__ = "3.0.0" 49__date__ = "$Date: 2004/10/18 00:14:20 $" 50__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson" 51__license__ = "PSF" 52 53from sgmllib import SGMLParser, SGMLParseError 54import codecs 55import types 56import re 57import sgmllib 58from htmlentitydefs import name2codepoint 59 60#This code makes Beautiful Soup able to parse XML with namespaces 61sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') 62 63DEFAULT_OUTPUT_ENCODING = "utf-8" 64 65# First, the classes that represent markup elements. 66 67class PageElement: 68 """Contains the navigational information for some part of the page 69 (either a tag or a piece of text)""" 70 71 def setup(self, parent=None, previous=None): 72 """Sets up the initial relations between this element and 73 other elements.""" 74 self.parent = parent 75 self.previous = previous 76 self.next = None 77 self.previousSibling = None 78 self.nextSibling = None 79 if self.parent and self.parent.contents: 80 self.previousSibling = self.parent.contents[-1] 81 self.previousSibling.nextSibling = self 82 83 def replaceWith(self, replaceWith): 84 oldParent = self.parent 85 myIndex = self.parent.contents.index(self) 86 if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: 87 # We're replacing this element with one of its siblings. 88 index = self.parent.contents.index(replaceWith) 89 if index and index < myIndex: 90 # Furthermore, it comes before this element. That 91 # means that when we extract it, the index of this 92 # element will change. 
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                self.parent.contents.remove(self)
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        if (isinstance(newChild, basestring)
            or isinstance(newChild, unicode)) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent != None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent == self:
                index = self.contents.index(newChild)
                if index and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if not position:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode
        if no encoding is given."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s

class NavigableString(unicode, PageElement):

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)

    def __unicode__(self):
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        if encoding:
            return self.encode(encoding)
        else:
            return self

class CData(NavigableString):

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)

class ProcessingInstruction(NavigableString):
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        output = self
        if "%SOUP-ENCODING%" in output:
            output = self.substituteEncoding(output, encoding)
        return "<?%s?>" % self.toEncoding(output, encoding)

class Comment(NavigableString):
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<!--%s-->" % NavigableString.__str__(self, encoding)

class Declaration(NavigableString):
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<!%s>" % NavigableString.__str__(self, encoding)

class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        "Basic constructor."

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs == None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        return self._getAttrMap().has_key(key)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self._getAttrMap()
        self.attrMap[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        self._getAttrMap()[key] = value

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        for item in self.attrs:
            if item[0] == key:
                self.attrs.remove(item)
                #We don't break because bad HTML can define the same
                #attribute multiple times.
            self._getAttrMap()
            if self.attrMap.has_key(key):
                del self.attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return apply(self.findAll, args, kwargs)

    def __getattr__(self, tag):
        #print "Getattr %s.%s" % (self.__class__, tag)
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__'):
            return self.find(tag)

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)
                    if '"' in val:
                        fmt = "%s='%s'"
                        # This can't happen naturally, but it can happen
                        # if you modify an attribute value and print it out.
                        if "'" in val:
                            val = val.replace("'", "&squot;")
                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string."""
        s = []
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this
        Tag matching the given criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria. You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findAllChildren = findAll

    #Utility methods

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.contents.append(tag)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        if not getattr(self, 'attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def childGenerator(self):
        for i in range(0, len(self.contents)):
            yield self.contents[i]
        raise StopIteration

    def recursiveChildGenerator(self):
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
        raise StopIteration

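
# --- Illustrative sketch (not part of the library) --------------------------
# The navigation and search methods defined above are easiest to see in use.
# The helper name and the markup below are made up for demonstration only;
# BeautifulSoup is defined later in this file, so the name is only resolved
# when (and if) this function is actually called.
def _tagExampleUsage():
    markup = '<html><body><p id="intro">Hello <b>world</b></p></body></html>'
    soup = BeautifulSoup(markup)
    p = soup.find('p', {'id': 'intro'})    # first <p> with id="intro", or None
    print p['id']                          # attribute access via __getitem__
    print p.b.string                       # tag-name attribute, then .string
    for text in soup.findAll(text=True):   # every NavigableString in the tree
        print text
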
# Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        self.name = name
        self.attrs = attrs.copy()
        self.attrs.update(kwargs)
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        callFunctionWithTagData = callable(self.name) \
                                  and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k, v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception, "I don't know how to match against a %s" \
                  % markup.__class__
        return found

    def _matches(self, markup, matchAgainst):
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
            result = markup != None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isString(markup):
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result

class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        list.__init__([])
        self.source = source

# Now, some helper functions.

def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike."""
    return hasattr(l, '__iter__') \
           or (type(l) in (types.ListType, types.TupleType))

def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        return isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        return isinstance(s, str)

def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            #It's a map. Merge it.
            for k, v in portion.items():
                built[k] = v
        elif isList(portion):
            #It's a list. Map each item to the default.
            for k in portion:
                built[k] = default
        else:
            #It's a scalar. Map it to the default.
            built[portion] = default
    return built

# Now, the parser classes.

class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    XML_ENTITY_LIST = {}
    for i in ["quot", "apos", "amp", "lt", "gt"]:
        XML_ENTITY_LIST[i] = True

    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}

    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed()
        except StopParsing:
            pass
        self.markup = None                 # The markup can now be GCed

    def _feed(self, inDocumentEncoding=None):
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
        if markup:
            if self.markupMassage:
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
               or methodName.find('do_') == 0:
            return SGMLParser.__getattr__(self, methodName)
        elif methodName.find('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        tag = self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableString):
            self.currentTag.string = self.currentTag.contents[0]

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        if self.currentData:
            currentData = ''.join(self.currentData)
            if not currentData.strip():
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)

    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                popTo = name
                break
            if (nestingResetTriggers != None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers == None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        #print "Start tag %s" % name
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities in [self.HTML_ENTITIES,
                                    self.XML_ENTITIES]:
            data = unichr(int(ref))
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertEntities == self.HTML_ENTITIES or \
               (self.convertEntities == self.XML_ENTITIES and
                self.XML_ENTITY_LIST.get(ref)):
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass
        if not data:
            data = '&%s;' % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j

class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
      should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
      should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
      should be transformed into:
       <table><tr>Blah</tr><tr>Blah
      but,
       <tr>Blah<table><tr>Blah
      should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br', 'hr', 'input', 'img', 'meta',
                                     'spacer', 'link', 'frame', 'base'])

    QUOTE_TAGS = {'script': None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if getattr(self, 'declaredHTMLEncoding') or \
                       (self.originalEncoding == self.fromEncoding):
                    # This is our second pass through the document, or
                    # else an encoding was specified explicitly and it
                    # worked. Rewrite the meta tag.
                    newAttr = self.CHARSET_RE.sub \
                              (lambda(match): match.group(1) +
                               "%SOUP-ENCODING%", value)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the new information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True

class StopParsing(Exception):
    pass

class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)

class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}

class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)

#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
#your pain! For such-minded folk, the Beautiful Soup Consortium And
#All-Night Kosher Bakery recommends renaming this file to
#"RobustParser.py" (or, in cases of extreme enterprisitude,
#"RobustParserBeanInterface.class") and using the following
#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    pass
class RobustHTMLParser(BeautifulSoup):
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    pass

######################################################
#
# Bonus library: Unicode, Dammit
#
# This class forces XML data into a standard format (usually to UTF-8
# or Unicode). It is heavily based on code from Mark Pilgrim's
# Universal Feed Parser. It does not rewrite the XML or HTML to
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
# (XML) and BeautifulSoup.start_meta (HTML).

# Autodetects character encodings.
# Download from http://chardet.feedparser.org/
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except:
    chardet = None

# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
    import cjkcodecs.aliases
except:
    pass
try:
    import iconv_codec
except:
    pass

class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml'):
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if isinstance(markup, unicode):
            # The markup is already Unicode; there's nothing to detect
            # or convert.
            self.unicode = markup
            self.originalEncoding = None
            return

        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break
        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        sub = self.MS_CHARS.get(orig)
        if type(sub) == types.TupleType:
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed in ("windows-1252",
                                               "ISO-8859-1",
                                               "ISO-8859-2"):
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda(x): self._subMSChar(x.group(1)),
                      markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
            # print e
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present)
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data):
        """Given a document, tries to detect its XML encoding."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
            xml_encoding_match = re.compile \
                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
                                 .match(xml_data)
        except:
            xml_encoding_match = None
        if xml_encoding_match:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding

    def find_codec(self, charset):
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except LookupError:
            pass
        return codec

    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans(
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}

#######################################################################


#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    soup = BeautifulStoneSoup(sys.stdin.read())
    print soup.prettify()
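
# --- Illustrative sketch (not part of the library) --------------------------
# A second made-up example showing two other entry points defined above:
# SoupStrainer, to restrict parsing to matching elements, and UnicodeDammit,
# to coerce a byte string of unknown encoding to Unicode. The helper name,
# markup, and byte string are assumptions made for demonstration only.
def _strainerAndUnicodeExample():
    onlyLinks = SoupStrainer('a')                    # match only <a> tags
    markup = '<p>Ignored</p><a href="/one">1</a><a href="/two">2</a>'
    soup = BeautifulSoup(markup, parseOnlyThese=onlyLinks)
    for link in soup.findAll('a'):
        print link['href']                           # /one, then /two

    converted = UnicodeDammit('Sacr\xe9 bleu!', ['latin-1'])
    print converted.unicode                          # the decoded Unicode text
    print converted.originalEncoding                 # 'latin-1'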