File: lynx_parse.py

package info (click to toggle)
qm 1.1.3-1
  • links: PTS
  • area: main
  • in suites: woody
  • size: 8,628 kB
  • ctags: 10,249
  • sloc: python: 41,482; ansic: 20,611; xml: 12,837; sh: 485; makefile: 226
file content (75 lines) | stat: -rw-r--r-- 1,958 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
#
# lynx_parse.py :
# Reads the Lynx bookmark files (*.html) found in a directory given on the
# command line, and outputs the corresponding XBEL document (optionally to
# a file named by a second argument).
#
# Sample usage: ./lynx_parse.py ~/bookmarks/
#    (The script requires the path to the directory where your bookmark files
#     are stored.)
#

import bookmark
import re

# Matches the document title; the captured text names the folder created for
# the file.  No DOTALL here, so a title must sit on a single line.
_TITLE_RE = re.compile(r"<title>(.*?)</title>", re.IGNORECASE)

# Matches one bookmark entry: an <li> at the start of a line (optionally
# preceded by whitespace) followed by an <a href="..."> anchor.  DOTALL lets
# the link text span several lines.
_BOOKMARK_RE = re.compile(
    r"""^ \s* <li> \s*
    <a \s+ href \s* = \s* "(?P<url> [^"]* )" \s*>
    (?P<name> .*? ) </a>""",
    re.IGNORECASE | re.DOTALL | re.VERBOSE | re.MULTILINE)


def parse_lynx_file(bms, input):
    """Add the bookmarks found in a Lynx 2.8 bookmark file to ``bms``.

    The whole file is read into memory.  A folder named after the file's
    ``<title>`` element (or "Untitled" when none is found) is opened on
    ``bms``, one bookmark is added per ``<li><a href="...">name</a>`` entry,
    and the folder is left again.

    Parameters:
        bms   -- bookmark collection; must provide ``add_folder(title)``,
                 ``add_bookmark(name, href=url)`` and ``leave_folder()``.
        input -- readable file object positioned at the start of the
                 bookmark file's text.

    Nothing is returned and nothing is written out here; the results are
    recorded on ``bms`` only.
    """
    # Read the whole file into memory
    data = input.read()

    # Get the title
    title_match = _TITLE_RE.search(data)
    if title_match is None:
        title = "Untitled"
    else:
        title = title_match.group(1)

    bms.add_folder(title)

    # finditer yields the same successive non-overlapping matches that a
    # manual search-from-last-end loop would produce.
    for entry in _BOOKMARK_RE.finditer(data):
        bms.add_bookmark(entry.group("name"), href=entry.group("url"))

    bms.leave_folder()

if __name__ == '__main__':
    # Command-line driver: convert every *.html file in the directory given
    # as the first argument into one XBEL document, written either to the
    # file named by the optional second argument or via bms.dump_xbel()
    # with no argument.
    import sys, glob

    # Exactly one or two arguments are accepted.
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        # Single-argument print(...) calls produce the same output under
        # both the Python 2 print statement and the Python 3 print function.
        print("")
        print("A simple utility to convert Lynx bookmarks to XBEL.")
        print("")
        print("Usage: ")
        print("  lynx_parse.py <lynx-directory> [<xbel-file>]")
        sys.exit(1)

    bms = bookmark.Bookmarks()

    # Determine the owner on Unix platforms.  Imported here, after the usage
    # check, so the usage message still works where pwd is unavailable.
    import os, pwd
    passwd_entry = pwd.getpwuid(os.getuid())
    # Index 4 of a password-database entry is the GECOS (name/comment) field.
    bms.owner = passwd_entry[4]

    # Each bookmark file becomes one folder in the resulting document.
    for path in glob.glob(os.path.join(sys.argv[1], '*.html')):
        bookmark_file = open(path)
        # try/finally (rather than leaving the handle open) guarantees the
        # file is closed even if parsing raises.
        try:
            parse_lynx_file(bms, bookmark_file)
        finally:
            bookmark_file.close()

    if len(sys.argv) == 3:
        out = open(sys.argv[2], "w")
        # Close the output file even when dumping fails part-way.
        try:
            bms.dump_xbel(out)
        finally:
            out.close()
    else:
        bms.dump_xbel()