File: crawl-ftp.py

package info (click to toggle)
astrometry.net 0.93%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 19,372 kB
  • sloc: ansic: 163,192; python: 18,357; makefile: 1,522; sh: 138; cpp: 78; pascal: 67; awk: 56; perl: 9
file content (121 lines) | stat: -rw-r--r-- 3,274 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# This file is part of the Astrometry.net suite.
# Licensed under a 3-clause BSD style license - see LICENSE
from __future__ import print_function
import os
import os.path
import re
import socket
import sys

from ftplib import FTP


class crawler(object):
    dirstack = [ '' ]
    currentdir = None

    ffiles = None
    fdirs = None
    fnomatches = None

    def __init__(self):
        self.ffiles = open('files', 'ab')
        self.fdirs = open('dirs', 'ab')
        self.fnomatches = open('nomatch', 'ab')

    ire = re.compile(r'^(?P<dir>.)' + # 'd' or '-'
                     r'.{9}' + # mode
                     r'\s*\w*' + # space, link count
                     r'\s*\w*' + # space, owner
                     r'\s*\w*' + # space, group
                     r'\s*(?P<size>\d*)' + # filesize
                     r'\s*(?P<month>\w*)' + # space, month
                     r'\s*(?P<day>\w*)' + # space, day
                     r'\s*(?P<time>[\w:]*)' + # space, time
                     r'\s*(?P<name>[\w.~_+-]*)' + # space, name
                     r'$')

    def close(self):
        self.ffiles.close()
        self.fdirs.close()
        self.fnomatches.close()

    def set_dirstack(self, stack):
        self.dirstack = stack

    def write_stack(self):
        f = open('dirstack.tmp', 'wb')
        for d in self.dirstack:
            f.write(d + '\n')
        f.close()
        os.rename('dirstack.tmp', 'dirstack')

    def add_item(self, s):
        print('item', s)
        m = self.ire.match(s)
        if not m:
            print('no match')
            self.fnomatches.write(self.currentdir + ' ' + s + '\n')
            self.fnomatches.flush()
            return

        d = m.group('dir')
        name = m.group('name')
        path = self.currentdir + '/' + name
        
        if d == 'd':
            self.dirstack.append(path)
            self.fdirs.write(path + '\n')
            self.fdirs.flush()
        else:
            self.ffiles.write(path + '\n')
            self.ffiles.flush()



if __name__ == '__main__':
    ftp = None

    crawl = crawler()

    if os.path.exists('dirstack'):
        f = open('dirstack', 'rb')
        stack = f.read().strip().split('\n')
        #stack = []
        #for ln in f:
        #    stack.append(ln)
        print('Dirstack:')
        for d in stack:
            print(d)
        print('(end dirstack)')
        crawl.set_dirstack(stack)

    nrequests = 0
    socket.setdefaulttimeout(10)

    while len(crawl.dirstack):
        if not ftp: # or not (nrequests % 100):
            if ftp:
                print('closing connection.')
                ftp.quit()
            print('opening connection')
            sys.stdout.flush()
            ftp = FTP('galex.stsci.edu')
            ftp.login('anonymous', 'dstn@cs.toronto.edu')
            #ftp.set_debuglevel(2)

        d = crawl.dirstack.pop()
        crawl.currentdir = d
        print('listing "%s"' % d)
        sys.stdout.flush()
        try:
            ftp.dir(d, crawl.add_item)
            crawl.write_stack()
            nrequests += 1
        except Exception as e:
            print('caught exception:', e)
            sys.stdout.flush()
            crawl.dirstack.append(d)
            ftp.close()
            ftp = None