#!/usr/bin/python3
# -*- coding: utf-8 -*-
# $Id: getentityname.py 33777 2008-08-08 09:20:06Z toms $
#
# Copyright (C) 2012-2015 SUSE Linux GmbH
#
# Author:
# Thomas Schraitle
#

import optparse
import os.path
import re
import sys
import tempfile
from xml.sax import make_parser, handler, SAXParseException

__proc__ = os.path.basename(sys.argv[0])
__version__ = "$Revision: 33777 $"[11:-2]
__author__ = "Thomas Schraitle"
__license__ = "GPL"
# The usage text is expanded with the %-operator, therefore every literal
# percent sign inside it has to be written as '%%'.
__doc__ = """%(cmd)s [OPTIONS] XMLFILE(S)

This script finds every external entity in the internal subset of the DTD,
for example:

  <!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
                        "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd"
  [
    <!ENTITY %% entities SYSTEM "entity-decl.ent">
    <!ENTITY %% foo.ent SYSTEM "foo.ent">
    %%entities;
    %%foo.ent;
  ]>

The output will be: entity-decl.ent foo.ent

The script detects XML comments inside the internal subset and removes them.
""" % {"cmd": __proc__}

# Filled by getAllEntities() with the entities collected so far when the
# parser aborts, so that the command line driver can still print them.
resultEntities = []

# Number of leading lines of a file in which getFirstEntity() expects the
# complete DOCTYPE declaration.
_HEADER_LINES = 50

# Building blocks for the regular expressions (mostly taken from xmllib.py).
_SPACE = r"[ \t\r\n]"                       # a single whitespace character
_S = _SPACE + "+"                           # S:  one or more whitespace
_OPT_S = _SPACE + "*"                       # S?: optional whitespace
_NAME = r"[a-zA-Z_:][-a-zA-Z0-9._:]*"       # valid XML name
_QSTR = r"""(?:'[^']*'|"[^"]*")"""          # quoted XML string
_PUBID_LITERAL = (
    r"""(?:"[-'+,./:=?;!*#@$_% \n\ra-zA-Z0-9]*"|"""
    r"""'[-+,./:=?;!*#@$_% \n\ra-zA-Z0-9]*')"""
)
# The alternation is wrapped in a group; without it the '|' would split the
# whole declaration pattern into two unrelated halves.
_EXTERNAL_ID = (
    "(?:SYSTEM" + _S + "(?P<syslit>" + _QSTR + ")"
    + "|PUBLIC" + _S + "(?P<pubid>" + _PUBID_LITERAL + ")"
    + _S + "(?P<psyslit>" + _QSTR + "))"
)
# [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
# [74] PEDef  ::= EntityValue | ExternalID
# Only ExternalIDs are of interest here, so EntityValue is left out.
_PE_DECL_RE = re.compile(
    "<!ENTITY" + _S + "%" + _S + "(?P<pename>" + _NAME + ")"
    + _S + _EXTERNAL_ID + _OPT_S + ">"
)
# DOCTYPE declaration with an external ID *and* an internal subset.
_DOCTYPE_RE = re.compile(
    "<!DOCTYPE" + _S + "(?P<Name>" + _NAME + ")"
    + _S + "(?P<Type>PUBLIC|SYSTEM)"
    + _S + "(?P<ExternalID>.*)"
    + _S + r"\[(?P<IntSubset>.*)\]" + _OPT_S + ">",
    re.DOTALL,
)


class MyEntityResolver(handler.EntityResolver):
    """Collects the system identifiers of external entities during a parse."""

    def __init__(self, filename, ents=None):
        """Constructor of this class

        filename - XML file to be investigated
        ents     - list which receives the system identifiers; a fresh list
                   is created when omitted (never a shared default)
        """
        # Empty placeholder file handed to the parser instead of real DTDs
        self.tmpfile = tempfile.NamedTemporaryFile()
        self.filename = os.path.abspath(filename)
        self.ents = [] if ents is None else ents

    def getEntityList(self):
        """Returns the list of system identifiers collected so far"""
        return self.ents

    def resolveEntity(self, publicId, systemId):
        """Records the system identifier of entities without public identifier

        Returns the location the parser should read the entity from:
        the absolute path of 'systemId' (relative to the investigated file)
        for entities without public identifier, an empty temporary file for
        OASIS and Novell public identifiers, and "" for anything else.
        """
        if publicId is None:
            # Guard kept from the original code; resultEntities is only
            # filled by getAllEntities() after a parse error.
            if systemId not in resultEntities:
                self.ents.append(systemId)
            dirname = os.path.dirname(self.filename)
            return os.path.abspath(os.path.join(dirname, systemId))
        if publicId.startswith(("-//OASIS//", "-//Novell//DTD")):
            # Return a temporary filename without meaningful content
            return self.tmpfile.name
        # Should not happen...
        return ""


def joinEnts(unique, sep, ents):
    """Returns a string of names, separated by 'sep'.

    unique - if true, duplicates are dropped; the first occurrence of each
             name keeps its position so the result is reproducible
    sep    - separator string
    ents   - iterable of names
    """
    if not unique:
        return sep.join(ents)
    seen = set()
    ordered = []
    for name in ents:
        if name not in seen:
            seen.add(name)
            ordered.append(name)
    return sep.join(ordered)


def getAllEntities(options, filenames):
    """Collects *all* entities in XML files and prints them to stdout

    options   - object with the attributes 'unique' and 'separator'
    filenames - XML files which are parsed completely

    Exits with status 10 if a file does not exist. A SAXParseException is
    passed on to the caller; the entities found up to that point are stored
    in the module level list 'resultEntities' first.
    """
    ents = []
    for f in filenames:
        if not os.path.exists(f):
            print("ERROR: File »%s« not found!" % f, file=sys.stderr)
            sys.exit(10)
        path = os.path.abspath(f)
        parser = make_parser()
        # Since Python 3.7.1 xml.sax skips external entities unless asked to
        # process them, and then the entity resolver would never be called.
        # NOTE: only run this on trusted documents, every referenced entity
        # file is opened and read.
        parser.setFeature(handler.feature_external_ges, True)
        parser.setEntityResolver(MyEntityResolver(path, ents))
        cwd = os.getcwd()
        os.chdir(os.path.dirname(path))
        try:
            parser.parse(path)
        except SAXParseException:
            # Keep the partial result available for the caller
            resultEntities[:] = ents
            raise
        finally:
            os.chdir(cwd)
    print(joinEnts(options.unique, options.separator, ents))


def _stripComments(text):
    """Returns 'text' without XML comments

    Raises ValueError if a comment start or end has no counterpart.
    """
    kept = []
    pos = 0
    while True:
        start = text.find("<!--", pos)
        if start < 0:
            break
        end = text.find("-->", start + 4)
        if end < 0:
            raise ValueError("XML comment is not terminated")
        kept.append(text[pos:start])
        pos = end + 3
    kept.append(text[pos:])
    for part in kept:
        if "-->" in part:
            raise ValueError("End of XML comment without a start")
    return "".join(kept)


def getFirstEntity(options, filenames):
    """Prints the external parameter entities declared in the internal subset

    Only the first lines of each file are inspected with regular
    expressions; the document itself is not parsed. Declarations which use
    a PUBLIC identifier are skipped, as getAllEntities() does not report
    them either.

    options   - object with the attributes 'unique' and 'separator'
    filenames - XML files to inspect

    Exits with status 100 if the internal subset contains a broken comment.
    """
    ents = []
    for f in filenames:
        # Only the ASCII markup matters here, so undecodable bytes must not
        # abort the run.
        with open(f, "r", encoding="utf-8", errors="replace") as xmlfile:
            head = "".join(xmlfile.readline() for _ in range(_HEADER_LINES))
        match_obj = _DOCTYPE_RE.search(head)
        if not match_obj:
            continue
        content = match_obj.group("IntSubset")
        if not content:
            continue
        # Declarations inside comments are disabled and must not be listed
        try:
            content = _stripComments(content)
        except ValueError:
            print("ERROR: Wrong XML comment found!", file=sys.stderr)
            sys.exit(100)
        for decl in _PE_DECL_RE.finditer(content):
            literal = decl.group("syslit")
            if literal is not None:
                ents.append(literal[1:-1])  # Chop quotes
    print(joinEnts(options.unique, options.separator, ents))


def main(argv=None):
    """Parses the command line

    argv - list of arguments; defaults to sys.argv[1:]

    Returns the tuple (parser, options, filenames) where every filename is
    an absolute path.
    """
    parser = optparse.OptionParser(__doc__.strip(),
                                   version="Revision %s" % __version__)
    parser.add_option("-f", "--only-first",
        dest="first",
        action="store_true",
        help="Only inspect the DOCTYPE declaration at the start of each "
             "file instead of parsing the whole document (default %default)")
    parser.add_option("-u", "--unique",
        dest="unique",
        action="store_false",
        help="Switch off the removal of duplicate entity filenames "
             "(duplicates are removed by default)")
    parser.add_option("-s", "--separator",
        dest="separator",
        help="Set the separator between consecutive filenames "
             "(default '%default'). Use '\\n' and '\\t' to insert a "
             "newline and TAB character.")
    parser.set_defaults(first=False, unique=True, separator=' ')
    options, args = parser.parse_args(argv)

    # The shell passes escape sequences literally
    if options.separator == '\\n':
        options.separator = '\n'
    elif options.separator == '\\t':
        options.separator = '\t'

    resultargs = []
    for fn in args:
        if not os.path.isabs(fn):
            fn = os.path.abspath(fn)
        resultargs.append(fn)
    return parser, options, resultargs


if __name__ == "__main__":
    try:
        p, options, args = main()
        if not args:
            p.print_usage()
            sys.exit(1)
        if options.first:
            getFirstEntity(options, args)
        else:
            getAllEntities(options, args)
    except SAXParseException as e:
        # Report the problem, but still print what was found before it
        print(e, file=sys.stderr)
        print(joinEnts(options.unique, options.separator, resultEntities))
    except IOError as e:
        print(e, file=sys.stderr)