# -*- coding: utf-8 -*-
"""
Simple transformers for HTML5 content: interpret the C{time} element and the
@datetime/@dateTime attributes (adding @content and, when the value is
recognized, an XSD @datatype), and clean up @rel/@rev values on elements that
also carry @property.

NOTE(review): the original header also mentioned adding a @src for any @data
and a @content for a @value attribute, and its summary referred to adding a
top "about"; no such handling is visible in this module - confirm whether that
text was stale before relying on it.

@summary: HTML5 specific DOM transformations (time values, @rel/@rev cleanup)
@requires: U{RDFLib package}
@organization: U{World Wide Web Consortium}
@author: U{Ivan Herman}
@license: This software is available for use under the U{W3C® SOFTWARE NOTICE AND LICENSE}
@contact: Ivan Herman, ivan@w3.org
"""

"""
$Id: html5.py,v 1.15 2014-11-04 13:18:48 ivan Exp $
$Date: 2014-11-04 13:18:48 $
"""

from functools import reduce
# The handling of datetime is a little bit more complex... better put this in
# a separate function for a better management
from datetime import datetime
import re

datetime_type = "http://www.w3.org/2001/XMLSchema#dateTime"
time_type = "http://www.w3.org/2001/XMLSchema#time"
date_type = "http://www.w3.org/2001/XMLSchema#date"
date_gYear = "http://www.w3.org/2001/XMLSchema#gYear"
date_gYearMonth = "http://www.w3.org/2001/XMLSchema#gYearMonth"
date_gMonthDay = "http://www.w3.org/2001/XMLSchema#gMonthDay"
duration_type = "http://www.w3.org/2001/XMLSchema#duration"
plain = "plain"

handled_time_types = [datetime_type, time_type, date_type, date_gYear,
                      date_gYearMonth, date_gMonthDay, duration_type]

# strptime patterns accepted for each datatype. The duration entries only
# describe the date part of a duration; a time part (after 'T') is checked
# against _dur_times.
_formats = {
    date_gMonthDay: ["%m-%d"],
    date_gYearMonth: ["%Y-%m"],
    date_gYear: ["%Y"],
    date_type: ["%Y-%m-%d", "%Y-%m-%dZ"],
    time_type: ["%H:%M", "%H:%M:%S", "%H:%M:%SZ", "%H:%M:%S.%f"],
    datetime_type: ["%Y-%m-%dT%H:%M", "%Y-%m-%dT%H:%M:%S",
                    "%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%MZ",
                    "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ"],
    duration_type: ["P%dD", "P%YY%mM%dD", "P%YY%mM", "P%YY%dD", "P%YY",
                    "P%mM", "P%mM%dD"],
}

# strptime patterns accepted for the time part of a duration
_dur_times = ["%HH%MM%SS", "%HH", "%MM", "%SS", "%HH%MM", "%HH%SS", "%MM%SS"]


def _matches_any(value, formats):
    """
    Check whether a string can be parsed with at least one strptime pattern.
    @param value: string to test
    @param formats: iterable of C{datetime.strptime} format strings
    @return: True if one of the formats parses the whole string, False otherwise
    """
    for fmt in formats:
        try:
            datetime.strptime(value, fmt)
        except ValueError:
            continue
        return True
    return False


def _format_test(string):
    """
    Tests the string format to see whether it fits one of the time datatypes
    @param string: attribute value to test
    @return: a URI for the xsd datatype or the string 'plain'
    """
    # Try to get the easy cases: the value matches one of the patterns as is
    for key in _formats:
        if _matches_any(string, _formats[key]):
            return key

    # Now come the special cases :-(
    # Check first for the duration stuff, that is the nastiest.
    if len(string) > 2 and (string[0] == 'P' or (string[0] == '-' and string[1] == 'P')):
        # This is meant to be a duration type. The patterns have no notion of
        # a sign, so a leading '-' is removed before any further check.
        unsigned = string[1:] if string[0] == '-' else string
        if _matches_any(unsigned, _formats[duration_type]):
            return duration_type

        # See if the value contains a separate time portion, and check the
        # date and time portions separately
        parts = unsigned.split('T')
        if len(parts) == 2:
            date_part, time_part = parts
            # A bare 'P' date part is fine as long as a time part follows
            # (e.g. 'PT4H18M3S'); an empty time part never matches.
            date_ok = date_part == 'P' or _matches_any(date_part, _formats[duration_type])
            if date_ok and _matches_any(time_part, _dur_times):
                return duration_type
        # NOTE: strptime range-checks each field, so values such as 'P45D' or
        # 'PT36H' are still reported as plain.
        return plain

    # If we got here, we should check the time zone.
    # There is a discrepancy between the python and the HTML5/XSD lexical
    # string, which means that the datetime and the time zone portions have
    # to be handled separately. The expected tail is a sign followed by hh:mm.
    if len(string) > 6 and string[-6] in "+-":
        local_part = string[:-6]
        offset = string[-5:]
        if _matches_any(offset, ["%H:%M"]) and _matches_any(local_part, _formats[datetime_type]):
            return datetime_type

    return plain


def html5_extra_attributes(node, state):
    """
    Set @content (and possibly @datatype) on a node based on its time value.
    Nothing is done if the node already has a @content attribute. Otherwise
    the value is taken from @datetime, then @dateTime, then, for a C{time}
    element, from the text content of the element. If the node has no
    @datatype and the value is recognized as one of the handled time types,
    @datatype is set to the corresponding XSD URI.
    @param node: the current node that could be modified
    @param state: current state; C{state.options.space_preserve} controls
    whether whitespace in the text content is kept as is
    @type state: L{Execution context}
    """
    def _get_literal(Pnode):
        """
        Get (recursively) the full text from a DOM Node.
        @param Pnode: DOM Node
        @return: string
        """
        pieces = []
        for child in Pnode.childNodes:
            if child.nodeType == child.TEXT_NODE:
                pieces.append(child.data)
            elif child.nodeType == child.ELEMENT_NODE:
                pieces.append(_get_literal(child))
        text = "".join(pieces)
        if state.options.space_preserve:
            return text
        # Collapse every run of whitespace into one space and trim the ends
        return re.sub(r'(\r| |\n|\t)+', " ", text).strip()
    # end _get_literal

    def _set_time(value):
        """Store the value in @content; add @datatype unless one is already set."""
        if not node.hasAttribute("datatype"):
            # Check the datatype:
            dt = _format_test(value)
            if dt != plain:
                node.setAttribute("datatype", dt)
        # Finally, set the value itself
        node.setAttribute("content", value)
    # end _set_time

    # @content has top priority over the others...
    if node.hasAttribute("content"):
        return

    if node.hasAttribute("datetime"):
        _set_time(node.getAttribute("datetime"))
    elif node.hasAttribute("dateTime"):
        _set_time(node.getAttribute("dateTime"))
    elif node.tagName == "time":
        # Note that a possible @datetime/@dateTime value has already been taken care of
        _set_time(_get_literal(node))


def remove_rel(node, _state):
    """
    If @property and @rel/@rev are on the same element, then only CURIE and URI can appear as a rel/rev value.
    Values matched by C{termname} are dropped; if nothing is left the attribute
    itself is removed.
    @param node: the current node that could be modified
    @param _state: current state (not used here)
    @type _state: L{Execution context}
    """
    # Imported here, as in the original code; termname is only used through
    # its match() method (presumably a compiled regular expression - confirm).
    from ..termorcurie import termname

    def _massage_node(node, attr):
        """The real work for remove_rel is done here, parametrized with @rel and @rev"""
        if not (node.hasAttribute("property") and node.hasAttribute(attr)):
            return
        vals = node.getAttribute(attr).strip().split()
        if not vals:
            # An empty or whitespace-only attribute is left untouched
            return
        final_vals = [v for v in vals if not termname.match(v)]
        if final_vals:
            node.setAttribute(attr, ' '.join(final_vals))
        else:
            node.removeAttribute(attr)

    _massage_node(node, "rev")
    _massage_node(node, "rel")