1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
|
# Copyright 2002 by Katharine Lindner. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Code for more fancy file handles.
Classes:
SGMLExtractorHandle File object that strips tags and returns content from specified
tag blocks.
SGMLExtractor Object that scans for specified SGML tag pairs, removes any inner tags
and returns the raw content.
For example, the object SGMLExtractor( [ 'h1' ] ) applied to the following html file would return
'House that Jack Built'
SGMLExtractor( [ 'dt' ] ) would return 'ratcatdogcowmaiden'
SGMLExtractor( [ 'dt', 'dd' ] ) would return 'ratate the maltcatthat ate the rat' etc
<h1>House that Jack Built</h1>
<dl>
<dt><big>rat</big></dt>
<dd><big>ate the malt</big></dd>
<dt><big>cat</big></dt>
<dd><big>that ate the rat</big></dd>
<dt><big>dog</big></dt>
<dd><big>that worried the dats</big></dd>
<dt><big>cow</big></dt>
<dd><big>with crumpled horn</big></dd>
<dt><big>maiden</big></dt>
<dd><big>all forlorns</big></dd>
</dl>
"""
import os
import string
import StringIO
import sgmllib
class SGMLExtractorHandle:
    """A Python handle that strips SGML tags and returns only the data found
    between specified pairs of start and end tags.

    Each read method fetches raw data from the wrapped handle and passes it
    through an SGMLExtractor. Attributes that are not defined on this class
    are looked up on the wrapped handle.
    """

    def __init__(self, handle, tags_of_interest=None):
        """SGMLExtractorHandle(handle, tags_of_interest)

        handle is a file handle to SGML-formatted data.
        tags_of_interest is a list of root names for pairs of start and end
        tags. When omitted, an empty list is used.
        """
        # Use a fresh list per instance instead of a shared mutable default.
        if tags_of_interest is None:
            tags_of_interest = []
        self._handle = handle
        self._stripper = SGMLExtractor(tags_of_interest)

    def read(self, *args, **keywds):
        """Read from the wrapped handle and return the extracted content."""
        data = self._handle.read(*args, **keywds)
        return self._stripper.strip(data)

    def readline(self, *args, **keywds):
        """Read one line from the wrapped handle and return the extracted content."""
        line = self._handle.readline(*args, **keywds)
        return self._stripper.strip(line)

    def readlines(self, *args, **keywds):
        """Read all lines from the wrapped handle and return the extracted
        content of each line, in order, as a list.
        """
        lines = self._handle.readlines(*args, **keywds)
        # Feed every line that was actually read through the stripper; the
        # lines are processed in their original order.
        return [self._stripper.strip(line) for line in lines]

    def __getattr__(self, attr):
        """Forward any attribute not found on this object to the wrapped handle."""
        # If _handle itself is missing (e.g. the instance was created without
        # running __init__), looking it up below would re-enter __getattr__
        # forever; fail with a normal AttributeError instead.
        if attr == '_handle':
            raise AttributeError(attr)
        return getattr(self._handle, attr)
def is_empty(items):
    """Return 1 when items has no elements and 0 otherwise.

    The decision is based on len(items), and the result is an int flag.
    """
    # len() is never negative, so "not greater than zero" means "equal to zero".
    if len(items) == 0:
        return 1
    return 0
class SGMLExtractor:
class LocalParser(sgmllib.SGMLParser):
def __init__(self, tags_of_interest = [] ):
sgmllib.SGMLParser.__init__(self)
self.data = ''
self._instack = []
self._tags_of_interest = []
for tag in tags_of_interest:
self._tags_of_interest.append( tag.lower() )
def handle_data(self, data):
if( not is_empty( self._instack ) ):
self.data = self.data + data
def unknown_starttag(self, tag, attrs):
lower_tag = tag.lower()
if( lower_tag in self._tags_of_interest ):
self._instack.append( lower_tag )
def unknown_endtag(self, tag ):
if( not is_empty( self._instack ) ):
open_tag = self._instack.pop()
try:
if( open_tag != tag.lower() ):
self._instack.append( open_tag )
except:
print tag
def __init__(self, tags_of_interest = [] ):
self._parser = SGMLExtractor.LocalParser( tags_of_interest )
def strip(self, str):
"""S.strip(str) -> string
Strip the SGML tags from str.
"""
if not str: # empty string, don't do anything.
return ''
# I need to make sure that I don't return an empty string if
# the buffer is not empty. This can happen if there's a newline
# character embedded within a tag. Thus, I'll first check to
# see if the last character is a newline. If it is, and it's stripped
# away, I'll add it back.
is_newline = str[-1] in ['\n', '\r']
self._parser.data = '' # clear the parser's data (don't reset)
self._parser.feed(str)
if self._parser.data:
str = self._parser.data
elif is_newline:
str = '\n'
return str
|