# -*- coding: UTF-8 -*- # # imports.py # # Copyright 2009-2016 Giuseppe Penone # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA. import HTMLParser, htmlentitydefs import gtk, gio, os, xml.dom.minidom, re, base64, urllib2, binascii, shutil, glob, time import cons, machines, support def get_internal_link_from_http_url(link_url): """Get internal cherrytree link attribute from HTTP link url""" if link_url[0:4] == "http": return "webs %s" % link_url elif link_url[0:7] == "file://": return "file %s" % base64.b64encode(link_url[7:]) else: return "webs %s" % ("http://" + link_url) def get_web_links_offsets_from_plain_text(plain_text): """Parse plain text for possible web links""" web_links = [] max_end_offset = len(plain_text) max_start_offset = max_end_offset - 7 start_offset = 0 while start_offset < max_start_offset: if support.get_first_chars_of_string_at_offset_are(plain_text, start_offset, cons.WEB_LINK_STARTERS): end_offset = start_offset + 3 while (end_offset < max_end_offset)\ and (plain_text[end_offset] not in [cons.CHAR_SPACE, cons.CHAR_NEWLINE]): end_offset += 1 web_links.append([start_offset, end_offset]) start_offset = end_offset + 1 else: start_offset += 1 return web_links def epim_html_file_to_hier_files(filepath): """From EPIM HTML File to Folder of HTML Files""" epim_dir = os.path.join(cons.TMP_FOLDER, "EPIM") if os.path.isdir(epim_dir): shutil.rmtree(epim_dir) curr_state = 0 nodes_levels = [] nodes_names = [] nodes_content = [] curr_node_idx = 0 html_prefix = "" with open(filepath, 'r') as fd: for html_row in fd: if curr_state == 0: html_prefix += html_row if "" in html_row: curr_state = 1 elif curr_state == 1: if not "href" in html_row: curr_state = 2 continue nodes_levels.append(html_row.count(" ")) elif curr_state == 2: if html_row.startswith(""): nodes_names.append(support.clean_from_chars_not_for_filename(html_row[18:-14])) nodes_content.append("") curr_state = 3 elif curr_state == 3: if html_row.startswith("

# 1: waiting for node name # 2: waiting for node level # 3: gathering node content treepad_vec = treepad_string.split(cons.CHAR_CR+cons.CHAR_NEWLINE) for text_line in treepad_vec: if self.curr_state == 0: if len(text_line) > 5 and text_line[:6] == "": self.curr_state = 1 elif self.curr_state == 1: #print "node name", text_line self.curr_node_name = text_line self.curr_state = 2 elif self.curr_state == 2: #print "node level", text_line if re.match("\d+", text_line): self.curr_node_level = int(text_line) #print self.curr_node_level if self.curr_node_level <= self.former_node_level: for count in range(self.former_node_level - self.curr_node_level): self.nodes_list.pop() self.nodes_list.pop() self.former_node_level = self.curr_node_level self.curr_node_content = "" self.curr_state = 3 self.nodes_list.append(self.dom.createElement("node")) self.nodes_list[-1].setAttribute("name", self.curr_node_name) self.nodes_list[-1].setAttribute("prog_lang", cons.RICH_TEXT_ID) self.nodes_list[-2].appendChild(self.nodes_list[-1]) else: self.curr_node_name += text_line + cons.CHAR_SPACE elif self.curr_state == 3: if len(text_line) > 9 and text_line[:10] == "": self.curr_state = 0 self.rich_text_serialize(self.curr_node_content) else: self.curr_node_content += text_line + cons.CHAR_NEWLINE def get_cherrytree_xml(self, treepad_string): """Returns a CherryTree string Containing the Treepad Nodes""" self.dom = xml.dom.minidom.Document() self.nodes_list = [self.dom.createElement(cons.APP_NAME)] self.dom.appendChild(self.nodes_list[0]) self.parse_string_lines(treepad_string) return self.dom.toxml() class PlainTextHandler: """The Handler of the Plain Text Parsing""" def __init__(self, dad): """Machine boot""" self.dad = dad def rich_text_serialize(self, text_data): """Appends a new part to the XML rich text""" dom_iter = self.dom.createElement("rich_text") self.nodes_list[-1].appendChild(dom_iter) text_iter = self.dom.createTextNode(text_data) dom_iter.appendChild(text_iter) def add_folder(self, folderpath): """Add nodes from plain text files in a Folder""" for element in sorted(os.listdir(folderpath)): full_element = os.path.join(folderpath, element) if os.path.isfile(full_element): gio_file = gio.File(full_element) gio_file_info = gio_file.query_info("*") if not cons.IS_WIN_OS: mime_types = str(gio_file_info.get_icon()) if "text-" in mime_types: self.add_file(full_element) else: mime_type = gio_file_info.get_content_type() if mime_type in ["."+self.dad.ext_plain_import.lower(), "."+self.dad.ext_plain_import.upper()]: self.add_file(full_element) elif os.path.isdir(full_element): self.add_node_with_content(full_element, "") self.add_folder(full_element) self.nodes_list.pop() def add_file(self, filepath): """Add node from one plain text File""" file_content = "" try: file_descriptor = open(filepath, 'r') file_content = file_descriptor.read() file_descriptor.close() except: print "skip import of", filepath return self.add_node_with_content(filepath, support.auto_decode_str(file_content)) self.nodes_list.pop() def add_node_with_content(self, filepath, file_content): """Append Node and Fill Content""" self.nodes_list.append(self.dom.createElement("node")) node_name = os.path.basename(filepath) if node_name.lower().endswith("."+self.dad.ext_plain_import.lower()): len_ext = 1+len(self.dad.ext_plain_import) node_name = node_name[:-len_ext] self.nodes_list[-1].setAttribute("name", node_name) self.nodes_list[-1].setAttribute("prog_lang", cons.RICH_TEXT_ID) self.nodes_list[-2].appendChild(self.nodes_list[-1]) self.rich_text_serialize(file_content) def get_cherrytree_xml(self, filepath="", folderpath=""): """Returns a CherryTree string Containing the Plain Text Nodes""" self.dom = xml.dom.minidom.Document() self.nodes_list = [self.dom.createElement(cons.APP_NAME)] self.dom.appendChild(self.nodes_list[0]) if filepath: self.add_file(filepath) else: self.add_folder(folderpath) return self.dom.toxml() class MempadHandler: """The Handler of the Mempad File Parsing""" def __init__(self): """Machine boot""" self.xml_handler = machines.XMLHandler(self) def rich_text_serialize(self, text_data): """Appends a new part to the XML rich text""" dom_iter = self.dom.createElement("rich_text") self.nodes_list[-1].appendChild(dom_iter) text_iter = self.dom.createTextNode(text_data) dom_iter.appendChild(text_iter) def parse_binary_bytes(self, file_descriptor): """Parse the binary bytes one by one""" self.curr_state = 0 self.curr_node_name = "" self.curr_node_content = "" self.curr_node_level = 0 self.former_node_level = -1 # 0: waiting for first node level # 1: filling node name # 2: filling node content # 3: waiting for subsequent node level all_data = file_descriptor.read() for element in all_data: if self.curr_state == 0: if ord(element) == 1: self.curr_node_level = 1 self.curr_state = 1 elif self.curr_state == 1: if ord(element) == 0: #print self.curr_node_name self.curr_node_content = "" self.curr_state = 2 if self.curr_node_level <= self.former_node_level: for count in range(self.former_node_level - self.curr_node_level): self.nodes_list.pop() self.nodes_list.pop() self.former_node_level = self.curr_node_level self.nodes_list.append(self.dom.createElement("node")) self.nodes_list[-1].setAttribute("name", self.curr_node_name) self.nodes_list[-1].setAttribute("prog_lang", cons.RICH_TEXT_ID) self.nodes_list[-2].appendChild(self.nodes_list[-1]) else: self.curr_node_name += element elif self.curr_state == 2: if ord(element) == 0: #print self.curr_node_content self.curr_state = 3 self.rich_text_serialize(self.curr_node_content) else: self.curr_node_content += element else: self.curr_node_level = ord(element) self.curr_node_name = "" self.curr_state = 1 def get_cherrytree_xml(self, file_descriptor): """Returns a CherryTree string Containing the Mempad Nodes""" self.dom = xml.dom.minidom.Document() self.nodes_list = [self.dom.createElement(cons.APP_NAME)] self.dom.appendChild(self.nodes_list[0]) self.parse_binary_bytes(file_descriptor) return self.dom.toxml() class NotecaseHandler(HTMLParser.HTMLParser): """The Handler of the Notecase Files Parsing""" def __init__(self, dad): """Machine boot""" HTMLParser.HTMLParser.__init__(self) self.dad = dad self.xml_handler = machines.XMLHandler(self) def rich_text_serialize(self, text_data): """Appends a new part to the XML rich text""" dom_iter = self.dom.createElement("rich_text") for tag_property in cons.TAG_PROPERTIES: if self.curr_attributes[tag_property] != "": dom_iter.setAttribute(tag_property, self.curr_attributes[tag_property]) self.nodes_list[-1].appendChild(dom_iter) text_iter = self.dom.createTextNode(text_data) dom_iter.appendChild(text_iter) def handle_starttag(self, tag, attrs): """Encountered the beginning of a tag""" if self.curr_state == 0: if tag == "dt": # waiting for the title # got dt, we go state 0->1 self.curr_state = 1 elif self.curr_state == 2: if tag == "dl": # the current node becomes parent for pixbuf_element in self.pixbuf_vector: self.xml_handler.pixbuf_element_to_xml(pixbuf_element, self.nodes_list[-1], self.dom) self.pixbuf_vector = [] self.chars_counter = 0 # got dl, we go state 2->0 and wait for the child self.curr_state = 0 elif tag == "dt": # the current node has no more job to do for pixbuf_element in self.pixbuf_vector: self.xml_handler.pixbuf_element_to_xml(pixbuf_element, self.nodes_list[-1], self.dom) self.pixbuf_vector = [] self.chars_counter = 0 self.nodes_list.pop() # waiting for the title # got dt, we go state 2->1 self.curr_state = 1 elif tag == "b": self.curr_attributes[cons.TAG_WEIGHT] = cons.TAG_PROP_HEAVY elif tag == "i": self.curr_attributes[cons.TAG_STYLE] = cons.TAG_PROP_ITALIC elif tag == "u": self.curr_attributes[cons.TAG_UNDERLINE] = cons.TAG_PROP_SINGLE elif tag == "s": self.curr_attributes[cons.TAG_STRIKETHROUGH] = cons.TAG_PROP_TRUE elif tag == "span" and attrs[0][0] == cons.TAG_STYLE: match = re.match("(?<=^)(.+):(.+)(?=$)", attrs[0][1]) if match != None: if match.group(1) == "color": self.curr_attributes[cons.TAG_FOREGROUND] = match.group(2).strip() self.latest_span.append(cons.TAG_FOREGROUND) elif match.group(1) == "background-color": self.curr_attributes[cons.TAG_BACKGROUND] = match.group(2).strip() self.latest_span.append(cons.TAG_BACKGROUND) elif tag == "a" and len(attrs) > 0: link_url = attrs[0][1] if len(link_url) > 7: self.curr_attributes[cons.TAG_LINK] = get_internal_link_from_http_url(link_url) elif tag == "br": # this is a data block composed only by an endline self.rich_text_serialize(cons.CHAR_NEWLINE) self.chars_counter += 1 elif tag == "li": self.rich_text_serialize(cons.CHAR_NEWLINE+self.dad.chars_listbul[0]+cons.CHAR_SPACE) self.chars_counter += 3 elif tag in ["img", "v:imagedata"] and len(attrs) > 0: for attribute in attrs: if attribute[0] == "src": if attribute[1][:23] == "data:image/jpeg;base64,": jpeg_data = attribute[1][23:] pixbuf_loader = gtk.gdk.pixbuf_loader_new_with_mime_type("image/jpeg") try: pixbuf_loader.write(base64.b64decode(jpeg_data)) except: try: pixbuf_loader.write(base64.b64decode(jpeg_data + "=")) except: pixbuf_loader.write(base64.b64decode(jpeg_data + "==")) pixbuf_loader.close() pixbuf = pixbuf_loader.get_pixbuf() self.pixbuf_vector.append([self.chars_counter, pixbuf, cons.TAG_PROP_LEFT]) self.chars_counter += 1 elif attribute[1][:22] == "data:image/png;base64,": png_data = attribute[1][22:] pixbuf_loader = gtk.gdk.pixbuf_loader_new_with_mime_type("image/png") try: pixbuf_loader.write(base64.b64decode(png_data)) except: try: pixbuf_loader.write(base64.b64decode(png_data + "=")) except: pixbuf_loader.write(base64.b64decode(png_data + "==")) pixbuf_loader.close() pixbuf = pixbuf_loader.get_pixbuf() self.pixbuf_vector.append([self.chars_counter, pixbuf, cons.TAG_PROP_LEFT]) self.chars_counter += 1 def handle_endtag(self, tag): """Encountered the end of a tag""" if self.curr_state == 1: if tag == "dt": # title reception complete self.nodes_list.append(self.dom.createElement("node")) self.nodes_list[-1].setAttribute("name", self.curr_title) self.nodes_list[-1].setAttribute("prog_lang", cons.RICH_TEXT_ID) self.nodes_list[-2].appendChild(self.nodes_list[-1]) self.curr_title = "" # waiting for data if self.chars_counter > 0: # this means the new node is child of the previous, so we did not pop for pixbuf_element in self.pixbuf_vector: self.xml_handler.pixbuf_element_to_xml(pixbuf_element, self.nodes_list[-2], self.dom) self.pixbuf_vector = [] self.chars_counter = 0 # got dd, we go state 1->2 self.curr_state = 2 elif self.curr_state == 2: if tag == "dd": # the current node has no more job to do for pixbuf_element in self.pixbuf_vector: self.xml_handler.pixbuf_element_to_xml(pixbuf_element, self.nodes_list[-1], self.dom) self.pixbuf_vector = [] self.chars_counter = 0 self.nodes_list.pop() # got /dd, we go state 2->0 and wait for a brother self.curr_state = 0 elif tag == "dl": # the current node has no more job to do for pixbuf_element in self.pixbuf_vector: self.xml_handler.pixbuf_element_to_xml(pixbuf_element, self.nodes_list[-1], self.dom) self.pixbuf_vector = [] self.chars_counter = 0 self.nodes_list.pop() self.nodes_list.pop() # got /dl, we go state 2->0 and wait for a parent's sibling self.curr_state = 0 elif tag == "b": self.curr_attributes[cons.TAG_WEIGHT] = "" elif tag == "i": self.curr_attributes[cons.TAG_STYLE] = "" elif tag == "u": self.curr_attributes[cons.TAG_UNDERLINE] = "" elif tag == "s": self.curr_attributes[cons.TAG_STRIKETHROUGH] = "" elif tag == "span": if self.latest_span: if self.latest_span[-1] == cons.TAG_FOREGROUND: self.curr_attributes[cons.TAG_FOREGROUND] = "" elif self.latest_span[-1] == cons.TAG_BACKGROUND: self.curr_attributes[cons.TAG_BACKGROUND] = "" del self.latest_span[-1] elif tag == "a": self.curr_attributes[cons.TAG_LINK] = "" elif tag == "dl": # backward one level in nodes list for pixbuf_element in self.pixbuf_vector: self.xml_handler.pixbuf_element_to_xml(pixbuf_element, self.nodes_list[-1], self.dom) self.pixbuf_vector = [] self.chars_counter = 0 self.nodes_list.pop() def handle_data(self, data): """Found Data""" if self.curr_state == 0 or data in [cons.CHAR_NEWLINE, cons.CHAR_NEWLINE*2]: return if self.curr_state == 1: # state 1 got title self.curr_title += data elif self.curr_state == 2: # state 2 got data clean_data = data.replace(cons.CHAR_NEWLINE, "") self.rich_text_serialize(clean_data) self.chars_counter += len(clean_data) def handle_entityref(self, name): """Found Entity Reference like &name;""" if name in htmlentitydefs.name2codepoint: unicode_char = unichr(htmlentitydefs.name2codepoint[name]) else: return if self.curr_state == 1: # state 1 got title self.curr_title += unicode_char elif self.curr_state == 2: # state 2 got data self.rich_text_serialize(unicode_char) self.chars_counter += 1 def get_cherrytree_xml(self, input_string): """Parses the Given Notecase HTML String feeding the CherryTree XML dom""" self.dom = xml.dom.minidom.Document() self.nodes_list = [self.dom.createElement(cons.APP_NAME)] self.dom.appendChild(self.nodes_list[0]) self.curr_state = 0 self.curr_title = "" self.curr_attributes = {} for tag_property in cons.TAG_PROPERTIES: self.curr_attributes[tag_property] = "" self.latest_span = [] # curr_state 0: standby, taking no data # curr_state 1: waiting for node title, take one data # curr_state 2: waiting for node content, take many data self.pixbuf_vector = [] self.chars_counter = 0 self.feed(input_string.decode(cons.STR_UTF8, cons.STR_IGNORE)) return self.dom.toxml() class HTMLHandler(HTMLParser.HTMLParser): """The Handler of the HTML received from clipboard""" def __init__(self, dad): """Machine boot""" self.dad = dad self.monitored_tags = ["p", "b", "i", "u", "s", cons.TAG_PROP_H1, cons.TAG_PROP_H2, cons.TAG_PROP_H3, "span", "font"] HTMLParser.HTMLParser.__init__(self) def rich_text_serialize(self, text_data): """Appends a new part to the XML rich text""" dom_iter = self.dom.createElement("rich_text") for tag_property in cons.TAG_PROPERTIES: if self.curr_attributes[tag_property] != "": dom_iter.setAttribute(tag_property, self.curr_attributes[tag_property]) self.nodes_list[-1].appendChild(dom_iter) text_iter = self.dom.createTextNode(text_data) dom_iter.appendChild(text_iter) self.chars_counter += len(text_data) def get_rgb_gtk_attribute(self, html_attribute): """Get RGB GTK attribute from HTML attribute""" html_attribute_key = html_attribute.strip().lower() #print "html_attribute_key", html_attribute_key if html_attribute_key[0] == "#": return html_attribute_key if html_attribute_key in cons.HTML_COLOR_NAMES: return cons.HTML_COLOR_NAMES[html_attribute_key] if "rgb" in html_attribute_key: rgb_tern = [] for i in range(3): if i == 0: parenth_start = html_attribute_key.find(cons.CHAR_PARENTH_OPEN) else: parenth_start = html_attribute_key.find(cons.CHAR_COMMA) if i == 2: parenth_end = html_attribute_key[parenth_start+1:].find(cons.CHAR_PARENTH_CLOSE) else: parenth_end = html_attribute_key[parenth_start+1:].find(cons.CHAR_COMMA) if parenth_start < 0 or parenth_end < 0: break rgb_tern.append(int(html_attribute_key[parenth_start+1:parenth_start+1+parenth_end])) html_attribute_key = html_attribute_key[parenth_start+1+parenth_end:] if len(rgb_tern) != 3: print rgb_tern return None html_attribute_key = "#%.2x%.2x%.2x" % (rgb_tern[0], rgb_tern[1], rgb_tern[2]) return html_attribute_key return None def handle_starttag(self, tag, attrs): """Encountered the beginning of a tag""" if tag in self.monitored_tags: self.in_a_tag += 1 if self.curr_state == 0: if tag == "body": self.num_bodies -= 1 if self.num_bodies == 0: self.curr_state = 1 elif self.curr_state == 1: if tag == "table": self.curr_state = 2 self.curr_table = [] self.curr_rows_span = [] self.curr_table_header = False self.curr_cell = "" elif tag == "b": self.curr_attributes[cons.TAG_WEIGHT] = cons.TAG_PROP_HEAVY elif tag == "i": self.curr_attributes[cons.TAG_STYLE] = cons.TAG_PROP_ITALIC elif tag == "u": self.curr_attributes[cons.TAG_UNDERLINE] = cons.TAG_PROP_SINGLE elif tag == "s": self.curr_attributes[cons.TAG_STRIKETHROUGH] = cons.TAG_PROP_TRUE elif tag == cons.TAG_STYLE: self.curr_state = 0 elif tag == "span": for attr in attrs: if attr[0] == cons.TAG_STYLE: attributes = attr[1].split(";") for attribute in attributes: #print "attribute", attribute colon_pos = attribute.find(cons.CHAR_COLON) if colon_pos < 0: continue attr_name = attribute[:colon_pos].strip().lower() attr_value = attribute[colon_pos+1:].strip().lower() #print attr_name, attr_value if attr_name == "color": attribute = self.get_rgb_gtk_attribute(attr_value) if attribute: self.curr_attributes[cons.TAG_FOREGROUND] = attribute self.latest_span.append(cons.TAG_FOREGROUND) elif attr_name in [cons.TAG_BACKGROUND, "background-color"]: attribute = self.get_rgb_gtk_attribute(attr_value) if attribute: self.curr_attributes[cons.TAG_BACKGROUND] = attribute self.latest_span.append(cons.TAG_BACKGROUND) elif attr_name == "text-decoration": if attr_value in [cons.TAG_UNDERLINE, "underline;"]: self.curr_attributes[cons.TAG_UNDERLINE] = cons.TAG_PROP_SINGLE self.latest_span.append(cons.TAG_UNDERLINE) elif attr_value in ["line-through"]: self.curr_attributes[cons.TAG_STRIKETHROUGH] = cons.TAG_PROP_TRUE self.latest_span.append(cons.TAG_STRIKETHROUGH) elif attr_name == "font-weight": if attr_value in ["bold", "bolder", "700"]: self.curr_attributes[cons.TAG_WEIGHT] = cons.TAG_PROP_HEAVY self.latest_span.append(cons.TAG_WEIGHT) elif attr_name == "font-style": if attr_value in [cons.TAG_PROP_ITALIC]: self.curr_attributes[cons.TAG_STYLE] = cons.TAG_PROP_ITALIC self.latest_span.append(cons.TAG_STYLE) elif tag == "font": for attr in attrs: if attr[0] == "color": attribute = self.get_rgb_gtk_attribute(attr[1].strip()) if attribute: self.curr_attributes[cons.TAG_FOREGROUND] = attribute self.latest_font = cons.TAG_FOREGROUND elif tag in [cons.TAG_PROP_H1, cons.TAG_PROP_H2, cons.TAG_PROP_H3, cons.TAG_PROP_H4, cons.TAG_PROP_H5, cons.TAG_PROP_H6]: self.rich_text_serialize(cons.CHAR_NEWLINE) if tag == cons.TAG_PROP_H1: self.curr_attributes[cons.TAG_SCALE] = cons.TAG_PROP_H1 elif tag == cons.TAG_PROP_H2: self.curr_attributes[cons.TAG_SCALE] = cons.TAG_PROP_H2 else: self.curr_attributes[cons.TAG_SCALE] = cons.TAG_PROP_H3 for attr in attrs: if attr[0] == "align": self.curr_attributes[cons.TAG_JUSTIFICATION] = attr[1].strip().lower() elif tag == "a" and len(attrs) > 0: #print "attrs", attrs for attr in attrs: if attr[0] == "href": link_url = attr[1] if len(link_url) > 7: self.curr_attributes[cons.TAG_LINK] = get_internal_link_from_http_url(link_url) break elif tag == "br": self.rich_text_serialize(cons.CHAR_NEWLINE) elif tag == "ol": self.curr_list_type = ["o", 1] elif tag == "ul": self.curr_list_type = ["u", 0] elif tag == "li": if self.curr_list_type[0] == "u": self.rich_text_serialize(self.dad.chars_listbul[0]+cons.CHAR_SPACE) else: self.rich_text_serialize("%s. " % self.curr_list_type[1]) self.curr_list_type[1] += 1 elif tag in ["img", "v:imagedata"] and len(attrs) > 0: dic_attrs = dict(a for a in attrs) img_path = dic_attrs.get('src', "") self.insert_image(img_path) elif tag == "pre": self.pre_tag = "p" elif self.curr_state == 2: if tag == "table": # nested tables self.curr_table = [] self.curr_rows_span = [] self.curr_table_header = False self.curr_cell = "" elif tag == "tr": if self.curr_cell and self.curr_table: # case of not closed self.curr_table[-1].append(self.curr_cell) self.curr_cell = "" self.curr_table.append([]) elif tag in ["td", "th"]: if not self.curr_table: # case of first missing , this is the header even if self.curr_table.append([]) self.curr_table_header = True self.curr_cell = "" self.curr_rowspan = 1 for attr in attrs: if attr[0] == "rowspan": self.curr_rowspan = int(attr[1]) if tag == "th": self.curr_table_header = True elif tag in ["img", "v:imagedata"] and len(attrs) > 0: dic_attrs = dict(a for a in attrs) img_path = dic_attrs.get('src', "") self.insert_image(img_path, cons.CHAR_NEWLINE*2) elif tag == "br": self.curr_cell += cons.CHAR_NEWLINE elif tag == "ol": self.curr_list_type = ["o", 1] elif tag == "ul": self.curr_list_type = ["u", 0] elif tag == "li": if self.curr_list_type[0] == "u": self.curr_cell += self.dad.chars_listbul[0]+cons.CHAR_SPACE else: self.curr_cell += "%s. " % self.curr_list_type[1] self.curr_list_type[1] += 1 def insert_image(self, img_path, trailing_chars=""): """Insert Image in Buffer""" try: self.dad.statusbar.push(self.dad.statusbar_context_id, _("Downloading") + " %s ..." % img_path) while gtk.events_pending(): gtk.main_iteration() url_desc = urllib2.urlopen(img_path, timeout=3) image_file = url_desc.read() pixbuf_loader = gtk.gdk.PixbufLoader() pixbuf_loader.write(image_file) pixbuf_loader.close() pixbuf = pixbuf_loader.get_pixbuf() self.dad.xml_handler.pixbuf_element_to_xml([self.chars_counter, pixbuf, cons.TAG_PROP_LEFT], self.nodes_list[-1], self.dom) self.chars_counter += 1 self.dad.statusbar.pop(self.dad.statusbar_context_id) if trailing_chars: self.rich_text_serialize(trailing_chars) except: if os.path.isfile(os.path.join(self.local_dir, img_path)): pixbuf = gtk.gdk.pixbuf_new_from_file(os.path.join(self.local_dir, img_path)) self.dad.xml_handler.pixbuf_element_to_xml([self.chars_counter, pixbuf, cons.TAG_PROP_LEFT], self.nodes_list[-1], self.dom) self.chars_counter += 1 else: print "failed download of", img_path self.dad.statusbar.pop(self.dad.statusbar_context_id) def handle_endtag(self, tag): """Encountered the end of a tag""" if tag in self.monitored_tags: self.in_a_tag -= 1 if self.curr_state == 0: if tag == cons.TAG_STYLE: self.curr_state = 1 if self.curr_state == 1: if tag == "p": self.rich_text_serialize(cons.CHAR_NEWLINE) elif tag == "b": self.curr_attributes[cons.TAG_WEIGHT] = "" elif tag == "i": self.curr_attributes[cons.TAG_STYLE] = "" elif tag == "u": self.curr_attributes[cons.TAG_UNDERLINE] = "" elif tag == "s": self.curr_attributes[cons.TAG_STRIKETHROUGH] = "" elif tag == "span": if self.latest_span: if self.latest_span[-1] == cons.TAG_FOREGROUND: self.curr_attributes[cons.TAG_FOREGROUND] = "" elif self.latest_span[-1] == cons.TAG_BACKGROUND: self.curr_attributes[cons.TAG_BACKGROUND] = "" elif self.latest_span[-1] == cons.TAG_UNDERLINE: self.curr_attributes[cons.TAG_UNDERLINE] = "" elif self.latest_span[-1] == cons.TAG_STRIKETHROUGH: self.curr_attributes[cons.TAG_STRIKETHROUGH] = "" elif self.latest_span[-1] == cons.TAG_WEIGHT: self.curr_attributes[cons.TAG_WEIGHT] = "" elif self.latest_span[-1] == cons.TAG_STYLE: self.curr_attributes[cons.TAG_STYLE] = "" del self.latest_span[-1] elif tag == "font": if self.latest_font == cons.TAG_FOREGROUND: self.curr_attributes[cons.TAG_FOREGROUND] = "" elif tag in [cons.TAG_PROP_H1, cons.TAG_PROP_H2, cons.TAG_PROP_H3, cons.TAG_PROP_H4, cons.TAG_PROP_H5, cons.TAG_PROP_H6]: self.curr_attributes[cons.TAG_SCALE] = "" self.curr_attributes[cons.TAG_JUSTIFICATION] = "" self.rich_text_serialize(cons.CHAR_NEWLINE) elif tag == "a": self.curr_attributes[cons.TAG_LINK] = "" elif tag == "li": self.rich_text_serialize(cons.CHAR_NEWLINE) elif tag == "pre": self.pre_tag = "" elif self.curr_state == 2: if tag in ["td", "th"]: self.curr_table[-1].append(self.curr_cell) self.curr_cell = "" if len(self.curr_table) == 1: self.curr_rows_span.append(self.curr_rowspan) else: index = len(self.curr_table[-1])-1 #print "self.curr_rows_span", self.curr_rows_span while index >= len(self.curr_rows_span): # rowspan in very first row self.curr_rows_span.append(1) self.curr_table[-2].append("") if self.curr_rows_span[index] == 1: self.curr_rows_span[index] = self.curr_rowspan else: unos_found = 0 while unos_found < 2: if not unos_found: self.curr_table[-1].insert(index, "") else: self.curr_table[-1].append("") self.curr_rows_span[index] -= 1 index += 1 if index == len(self.curr_rows_span): break if self.curr_rows_span[index] == 1: unos_found += 1 if unos_found < 2: index += 1 if index == len(self.curr_rows_span): break if self.curr_rows_span[index] == 1: unos_found += 1 elif tag == "table": self.curr_state = 1 if not self.curr_table[-1]: # case of latest without any afterwards del self.curr_table[-1] if len(self.curr_table) == 1 and len(self.curr_table[0]) == 1: # it's a codebox text_inside_codebox = self.curr_table[0][0].strip() if text_inside_codebox: codebox_dict = { 'frame_width': 300, 'frame_height': 150, 'width_in_pixels': True, 'syntax_highlighting': cons.PLAIN_TEXT_ID, 'highlight_brackets': False, 'show_line_numbers': False, 'fill_text': text_inside_codebox } self.dad.xml_handler.codebox_element_to_xml([self.chars_counter, codebox_dict, cons.TAG_PROP_LEFT], self.nodes_list[-1], self.dom) self.chars_counter += 1 else: print "empty codebox skip" else: # it's a table if not self.curr_table_header: self.curr_table.append([_("click me")]*len(self.curr_table[0])) else: self.curr_table.append(self.curr_table.pop(0)) table_dict = {'col_min': cons.TABLE_DEFAULT_COL_MIN, 'col_max': cons.TABLE_DEFAULT_COL_MAX, 'matrix': self.curr_table} self.dad.xml_handler.table_element_to_xml([self.chars_counter, table_dict, cons.TAG_PROP_LEFT], self.nodes_list[-1], self.dom) self.chars_counter += 1 self.rich_text_serialize(cons.CHAR_NEWLINE) elif tag in ["p", "li"]: self.curr_cell += cons.CHAR_NEWLINE def handle_data(self, data): """Found Data""" if self.curr_state == 0: return if self.pre_tag == "p": self.rich_text_serialize(data) return if self.in_a_tag: clean_data = data.replace(cons.CHAR_NEWLINE, cons.CHAR_SPACE) else: clean_data = data.replace(cons.CHAR_NEWLINE, "") if not clean_data or clean_data == cons.CHAR_TAB: return if self.curr_state == 1: self.rich_text_serialize(clean_data.replace(cons.CHAR_TAB, cons.CHAR_SPACE)) elif self.curr_state == 2: self.curr_cell += clean_data.replace(cons.CHAR_TAB, "") def handle_entityref(self, name): """Found Entity Reference like &name;""" if self.curr_state == 0: return if name == "nbsp": unicode_char = cons.CHAR_SPACE elif name in htmlentitydefs.name2codepoint: unicode_char = unichr(htmlentitydefs.name2codepoint[name]) else: return if self.curr_state == 1: self.rich_text_serialize(unicode_char) elif self.curr_state == 2: self.curr_cell += unicode_char def handle_charref(self, name): """decimal and hexadecimal numeric character references of the form &#NNN; and &#xNNN;""" if self.curr_state == 0: return if name[0] in ['x', 'X']: unicode_num = int(name[1:], 16) else: unicode_num = int(name) if unicode_num == 160: # nbsp unicode_num = 32 # space unicode_char = unichr(unicode_num) if self.curr_state == 1: self.rich_text_serialize(unicode_char) elif self.curr_state == 2: self.curr_cell += unicode_char def get_clipboard_selection_xml(self, input_string): """Parses the Given HTML String feeding the XML dom""" self.dom = xml.dom.minidom.Document() self.nodes_list = [self.dom.createElement("root")] self.dom.appendChild(self.nodes_list[0]) self.nodes_list.append(self.dom.createElement("slot")) self.nodes_list[0].appendChild(self.nodes_list[-1]) self.boot_n_feed(input_string, "") return self.dom.toxml() def boot_n_feed(self, input_string, local_dir): """Init variables and start feed""" self.curr_state = 0 self.local_dir = local_dir self.curr_attributes = {} for tag_property in cons.TAG_PROPERTIES: self.curr_attributes[tag_property] = "" self.latest_span = [] self.latest_font = "" self.curr_cell = "" self.in_a_tag = 0 self.chars_counter = 0 self.curr_list_type = ["u", 0] self.pre_tag = "" # curr_state 0: standby, taking no data # curr_state 1: receiving rich text # curr_state 2: receiving table or codebox data if not HTMLCheck().is_html_ok(input_string): input_string = cons.HTML_HEADER % "" + input_string + cons.HTML_FOOTER #print "###############" #print input_string #with open("clipboard.log", 'w') as fd: #fd.write(input_string) #print "###############" self.num_bodies = len(re.findall("]*>", input_string, re.IGNORECASE)) self.feed(input_string) def add_folder(self, folderpath): """Add nodes from HTML files in a Folder""" folder_file_same_name = "" for element in sorted(os.listdir(folderpath)): if folder_file_same_name and folder_file_same_name == element: folder_file_same_name = "" continue full_element = os.path.join(folderpath, element) if os.path.isfile(full_element): gio_file = gio.File(full_element) gio_file_info = gio_file.query_info("*") if not cons.IS_WIN_OS: mime_types = str(gio_file_info.get_icon()).lower() if "html" in mime_types: self.add_file(full_element) else: mime_type = gio_file_info.get_content_type() if mime_type.lower() in [".html", ".htm"]: self.add_file(full_element) elif os.path.isdir(full_element): if os.path.isfile(full_element+".htm"): folder_file_same_name = element+".htm" self.add_file(full_element+".htm", do_pop=False) else: self.add_node_with_content(full_element, "") self.add_folder(full_element) self.nodes_list.pop() def add_file(self, filepath, do_pop=True): """Add node from one HTML File""" file_content = "" try: file_descriptor = open(filepath, 'r') file_content = file_descriptor.read() file_descriptor.close() except: print "skip import of", filepath return self.add_node_with_content(filepath, "") self.boot_n_feed(support.auto_decode_str(file_content), os.path.dirname(filepath)) if do_pop: self.nodes_list.pop() def add_node_with_content(self, filepath, file_content): """Append Node and Fill Content""" self.nodes_list.append(self.dom.createElement("node")) node_name = os.path.basename(filepath) if node_name.lower().endswith(".htm"): node_name = node_name[:-4] elif node_name.lower().endswith(".html"): node_name = node_name[:-5] #print node_name, len(self.nodes_list) self.nodes_list[-1].setAttribute("name", node_name) self.nodes_list[-1].setAttribute("prog_lang", cons.RICH_TEXT_ID) self.nodes_list[-2].appendChild(self.nodes_list[-1]) if file_content: self.rich_text_serialize(file_content) def get_cherrytree_xml(self, filepath="", folderpath=""): """Returns a CherryTree string Containing the HTML Nodes""" self.dom = xml.dom.minidom.Document() self.nodes_list = [self.dom.createElement(cons.APP_NAME)] self.dom.appendChild(self.nodes_list[0]) if filepath: self.add_file(filepath) else: self.add_folder(folderpath) return self.dom.toxml() class HTMLCheck(HTMLParser.HTMLParser): """Check for Minimal Tags""" def __init__(self): """Machine boot""" HTMLParser.HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): """Encountered the beginning of a tag""" if tag == "html" and self.steps == 0: self.steps = 1 elif tag == "head" and self.steps == 1: self.steps = 4 elif tag == "body" and self.steps == 5: self.steps = 6 def handle_endtag(self, tag): """Encountered the end of a tag""" if tag == "head" and self.steps == 4: self.steps = 5 elif tag == "body" and self.steps == 6: self.steps = 7 if tag == "html" and self.steps == 7: self.steps = 8 def is_html_ok(self, input_string): """Checks for the minimal html tags""" self.steps = 0 self.feed(input_string) return self.steps == 8