File: www.postcrossing.com

###
# postcrossing.com robots.txt file
#
# NOTE: Entries in robots.txt don't seem to inherit from '*', or at least not all bots handle that inheritance, hence the repetition below
###

User-agent: *
# Only the right user can open these pages, so stop triggering 403s
Disallow: /travelingpostcard/*
Disallow: /user/*/traveling
Disallow: /user/*/gallery/popular
Disallow: /user/*/map
Allow: /



#
# Don't need the extra load
#
User-agent: Googlebot-Image
# Only the right user can open these pages, so stop triggering 403s
Disallow: /travelingpostcard/*
Disallow: /user/*/traveling
Disallow: /user/*/gallery/popular
Disallow: /user/*/map
# extra
Disallow: /postcards/*
Disallow: /user/*/gallery
Disallow: /gallery
Disallow: /country/*
Allow: /
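#
# A minimal sketch of how these groups are evaluated, assuming the
# Protego.parse / can_fetch interface from the python-protego package that
# carries this file as test data (the bot name 'ExampleBot' and the local
# file path are placeholders): each crawler obeys only its best-matching
# group, which is why the '*' rules are repeated, and wildcard paths such
# as /user/*/map match any user name.
#
#   from protego import Protego
#
#   rules = open("www.postcrossing.com", encoding="utf-8").read()
#   rp = Protego.parse(rules)
#
#   # Generic crawlers fall under '*': country pages are allowed.
#   rp.can_fetch("https://www.postcrossing.com/country/DE", "ExampleBot")       # True
#
#   # Googlebot-Image matches its own group, which also blocks /country/*.
#   rp.can_fetch("https://www.postcrossing.com/country/DE", "Googlebot-Image")  # False
#
#   # The wildcard rule /user/*/map applies to every user name.
#   rp.can_fetch("https://www.postcrossing.com/user/alice/map", "ExampleBot")   # False
#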


#
# AdSense crawler
#
User-agent: Mediapartners-Google
Allow: /



#
# Wayback machine: don't overdo it
#
User-agent: ia_archiver
Disallow: /user/*
Disallow: /postcards/*
Disallow: /gallery
Allow: /


#
# Browser pipelining/pre-fetching is not always a good idea
#
User-agent: Fasterfox
Disallow: /


#
# Unidentified misbehaving bot
#
User-agent: bhc.collectionBot
Disallow: /


#
# Please respect our Terms of Service: spiders/scrapers are only allowed with explicit permission
#
User-agent: Scrapy
Disallow: /
User-agent: scrapybot
Disallow: /


#
# Below this point is taken from Wikipedia's robots.txt
#


# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Zealbot
Disallow: /

User-agent: MSIECrawler
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: Fetch
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: linko
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL.Control
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download Ninja
Disallow: /

# Misbehaving: requests much too fast:
User-agent: fast
Disallow: /

#
# Sorry, wget in its recursive mode is a frequent problem.
# Please read the man page and use it properly; there is a
# --wait option you can use to set the delay between hits,
# for instance.
#
User-agent: wget
Disallow: /

#
# The 'grub' distributed client has been *very* poorly behaved.
#
User-agent: grub-client
Disallow: /

#
# Doesn't follow robots.txt anyway, but...
#
User-agent: k2spider
Disallow: /

#
# Hits many times per second, not acceptable
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /

# A capture bot that downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /
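#
# A minimal end-to-end sketch, assuming only the Python standard library plus
# the Protego parser, and that the live file still matches this copy (URLs and
# bot names below are illustrative): fetch the robots.txt over HTTP and check
# a few of the user agents listed above, e.g. the sitewide ban on the 'Scrapy'
# token from the Terms-of-Service block.
#
#   from urllib.request import urlopen
#   from protego import Protego
#
#   with urlopen("https://www.postcrossing.com/robots.txt") as resp:
#       rp = Protego.parse(resp.read().decode("utf-8"))
#
#   rp.can_fetch("https://www.postcrossing.com/", "Scrapy")      # False
#   rp.can_fetch("https://www.postcrossing.com/", "wget")        # False
#   rp.can_fetch("https://www.postcrossing.com/", "ExampleBot")  # True ('*' group)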