File: pyRDFa.py

package info (click to toggle)
virtuoso-opensource 6.1.4%2Bdfsg1-7
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 245,116 kB
  • sloc: ansic: 639,631; sql: 439,225; xml: 287,085; java: 61,048; sh: 38,723; cpp: 36,889; cs: 25,240; php: 12,562; yacc: 9,036; lex: 7,149; makefile: 6,093; jsp: 4,447; awk: 1,643; perl: 1,017; ruby: 1,003; python: 329
file content (136 lines) | stat: -rw-r--r-- 5,166 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python
"""
Run the pyRdfa package on a string containing RDFa markup.
"""
import sys, getopt, platform, StringIO

from pyRdfa import processFile, processURI, parseRDFa, RDFaError, Options, _open_URI
from pyRdfa.transform.MetaName              	import meta_transform
from pyRdfa.transform.OpenID                	import OpenID_transform
from pyRdfa.transform.DublinCore            	import DC_transform
from pyRdfa.transform.Prefix		 	import set_prefixes, handle_vars
from pyRdfa.Options				import Options, DIST_NS, _add_to_comment_graph, ERROR, GENERIC_XML, XHTML_RDFA, HTML5_RDFA
from rdflib.Graph				import Graph

import xml.dom.minidom

# Host language to select for a parsed document, keyed by the pair
# (value of the root element's "xmlns" attribute, name of the root element).
__switch = {
    ("http://www.w3.org/1999/xhtml", "html"): XHTML_RDFA,
    ("http://www.w3.org/2000/svg", "svg"): GENERIC_XML,
}

def _processString(str, outputFormat, options, base, rdfOutput) :
    """Parse the RDFa markup held in a string and return the serialized RDF graph.

    The input is first parsed as XML with minidom. If that fails, and the options
    allow it (lax parsing and a host language other than generic XML), the input
    is parsed a second time with html5lib, provided that package can be imported.

    @param str: the markup to process
    @param outputFormat: name of the serialization format handed to RDFLib;
    "pretty-xml", "turtle" and "n3" are redirected to the serializers stored
    with pyRdfa
    @param options: L{Options} instance; its C{host_language} is updated when the
    root element is recognized or when the HTML5 fallback parser is used
    @param base: base value handed to L{parseRDFa}
    @param rdfOutput: if true, an error raised while serializing is returned as an
    exception graph instead of being raised
    @return: the result of serializing the graph in C{outputFormat}
    @raise RDFaError: if the input cannot be parsed, or if serializing fails and
    C{rdfOutput} is false
    """
    def __register_serializer(formatstring, class_name) :
        """Register one of the serializers stored with pyRdfa in RDFLib.
        @param formatstring: the string to identify this serializer with.
        @param class_name: name of the class in C{pyRdfa.serializers}.
        """
        from rdflib.plugin import register
        from rdflib.syntax import serializer, serializers
        register(formatstring, serializers.Serializer, "pyRdfa.serializers." + class_name, class_name)

    # Exchanging the stock XML and Turtle serializers against the versions stored with this package
    if outputFormat == "pretty-xml" :
        outputFormat = "my-xml"
        __register_serializer(outputFormat, "PrettyXMLSerializer")
    elif outputFormat in ("turtle", "n3") :
        outputFormat = "my-turtle"
        __register_serializer(outputFormat, "TurtleSerializer")

    graph = Graph()
    stream = StringIO.StringIO(str)
    try :
        dom = xml.dom.minidom.parse(stream)
        # Try to second-guess the input type
        # This is _not_ really kosher, but the minidom is not really namespace aware...
        # In practice the goal is to have the system recognize svg content automatically
        # First see if there is a default namespace defined for the document:
        top = dom.documentElement
        if top.hasAttribute("xmlns") :
            key = (top.getAttribute("xmlns"), top.nodeName)
            if key in __switch :
                options.host_language = __switch[key]
    except Exception :
        # XML parsing error in the input. "except Exception" rather than a bare
        # except, so that KeyboardInterrupt and SystemExit are not swallowed.
        value = sys.exc_info()[1]
        if options.host_language == GENERIC_XML or options.lax == False :
            raise RDFaError('Parsing error in input file: "%s"' % value)

        msg = 'XHTML Parsing error in input file: %s. Falling back on the HTML5 parser' % value
        if options.warnings : options.comment_graph.add_warning(msg)

        # Now try to see if an HTML5 parser is an alternative...
        try :
            import html5lib
        except Exception :
            # no alternative to the XHTML error, because HTML5 parser not available...
            raise RDFaError('XHTML Parsing error in input file. Though parsing is lax, HTML5 parser not available')

        from html5lib import treebuilders
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        # The failed XML parse has already consumed (part of) the stream; rewind it,
        # otherwise the HTML5 parser only sees what is left after the read position.
        stream.seek(0)
        try :
            dom = parser.parse(stream)
            # The host language has changed
            options.host_language = HTML5_RDFA
        except Exception :
            # Well, even the HTML5 parser could not do anything with this...
            value = sys.exc_info()[1]
            msg2 = 'Parsing error in input file as HTML5: "%s"' % value
            raise RDFaError(msg + '\n' + msg2)

    parseRDFa(dom, base, graph, options)

    # Got all the graphs, serialize them
    try :
        if options.comment_graph.graph is not None :
            # Add the content of the comment graph to the output
            graph.bind("dist", DIST_NS)
            for t in options.comment_graph.graph : graph.add(t)
        return graph.serialize(format=outputFormat)
    except Exception :
        value = sys.exc_info()[1]
        if not rdfOutput :
            # re-raise the exception and let the caller deal with it...
            raise RDFaError("%s" % value)
        # The input is an in-memory string, so there is no file name or URI to
        # substitute for an empty base; it is passed on unchanged.
        # NOTE(review): create_exception_graph is not imported at the top of this
        # file; it is assumed to be exported by the pyRdfa package - confirm.
        from pyRdfa import create_exception_graph
        return create_exception_graph("%s" % value, base, outputFormat, http=False)

		
def processString (file, base = "") :
    """Process a string of RDFa markup and return the graph serialized as "xml".

    The processing options are fixed: no warnings, whitespace preserved, no extra
    transformers, XHTML input, lax parsing.

    @param file: the markup to process, handed to L{_processString} as its input string
    @param base: base value handed on to L{_processString}
    @return: whatever L{_processString} returns for the "xml" output format
    """
    settings = {
        "warnings"       : False,
        "space_preserve" : True,
        "transformers"   : [],
        "xhtml"          : True,
        "lax"            : True,
    }
    return _processString(file, "xml", Options(**settings), base, False)