File: LinkChecker_rm_false_errors.py

package info (click to toggle)
abinit 9.10.4-3
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 518,712 kB
  • sloc: xml: 877,568; f90: 577,240; python: 80,760; perl: 7,019; ansic: 4,585; sh: 1,925; javascript: 601; fortran: 557; cpp: 454; objc: 323; makefile: 77; csh: 42; pascal: 31
file content (375 lines) | stat: -rwxr-xr-x 10,231 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
#!/usr/bin/env python

#
# Copyright (C) 2010-2022 ABINIT Group (Jean-Michel Beuken)
#
# This file is part of the ABINIT software package. For license information,
# please see the COPYING file in the top-level directory of the ABINIT source
# distribution.
#

#
# Elimination of the 'false' errors from the 'linkchecker_ext.log' generated by
# a script located on the ref slave ( abiref:~buildbot/bin/LinkChecker.sh )
#

from __future__ import unicode_literals, division, print_function, absolute_import

import sys
import os
import re
from lxml import etree
import requests
import argparse

# ---------------------------------------------------------------------------- #

#all_tags = [ 'url','name','parent','realurl','extern','dlsize','checktime','level','infos','valid' ]
#printable_tags = [ 'url', 'name', 'parent', 'realurl', 'valid' ]

# Version of this script, as a tuple and as the dotted string shown by --version.
version_info = (1, 0, 0)
version = '.'.join(str(c) for c in version_info)

# When True, main() prints a trace of the extra connection checks it performs.
debug = False
# URL prefix that rm_server() strips from the URLs it is given.
server = "http://localhost:8000"

#
# Functions
#

def rm_server(keyword):
    """Strip the leading ``server`` prefix from a URL.

    Args:
        keyword: URL string; None or an empty string are accepted.

    Returns:
        ``keyword`` without the ``server`` prefix when it starts with it,
        otherwise ``keyword`` unchanged. The previous implementation fell off
        the end of the function in that case and returned None, which the
        report then printed literally.
    """
    if keyword and keyword.startswith(server):
        return keyword[len(server):]
    return keyword

def Checking_on_url_to_skip(e, u, v):
    """Tell whether a record is one of the exact URLs listed in ``url_to_skip``.

    e : text of the record's <extern> element, as passed by main()
    u : URL of the record
    v : result string of the record

    Returns True only when ``e`` is "1", ``v`` is "syntax OK" or "filtered",
    and ``u`` is equal to one entry of ``url_to_skip``; False otherwise.
    """
    eligible = e == "1" and v in ("syntax OK", "filtered")
    return bool(eligible) and u in url_to_skip


def Checking_on_url_string_to_skip(e, u):
    """Tell whether a URL contains one of the substrings of ``url_string_to_skip``.

    e : text of the record's <extern> element, as passed by main()
    u : URL of the record

    Returns True only when ``e`` is "1" and at least one entry of
    ``url_string_to_skip`` occurs somewhere in ``u``; False otherwise.
    """
    if e != "1":
        return False
    return any(u.find(fragment) != -1 for fragment in url_string_to_skip)


def Checking_on_no_error_list(url, info, valid):
    """Tell whether a record matches an entry of ``no_error_list``.

    Every entry of ``no_error_list`` is a dict of compiled regular
    expressions stored under the keys 'url', 'info' and 'valid'. A key that
    is absent from an entry puts no constraint on the record, which is the
    rule Checking_on_false_error_list() already applies to its own table.
    Previously an entry without 'info' could never match a record carrying an
    info message, because the resulting KeyError was silently swallowed.

    Args:
        url: element whose ``.text`` is the checked URL.
        info: element whose ``.text`` is the info message of the record, or
            None when the record has none (the 'info' pattern is then not
            applied).
        valid: result string of the record.

    Returns:
        True when the record matches at least one entry (it is then not an
        error), False otherwise.
    """
    url_text = getattr(url, 'text', None)
    if url_text is None:
        # No URL to match against: keep the historical trace, printed once
        # per record instead of once per table entry.
        print("-- no URL NAME --")
        return False

    for no_error in no_error_list:
        url_pattern = no_error.get('url')
        if url_pattern is not None and url_pattern.search(url_text) is None:
            continue

        valid_pattern = no_error.get('valid')
        if valid_pattern is not None:
            if valid is None or valid_pattern.search(valid) is None:
                continue

        info_pattern = no_error.get('info')
        if info is not None and info_pattern is not None:
            info_text = info.text
            # An info element without text cannot satisfy the pattern.
            if info_text is None or info_pattern.search(info_text) is None:
                continue

        return True  # in the exception list -> next xml entry

    return False  # may be an error


def Checking_on_false_error_list(url, valid, parent, cnx):
    """Tell whether a record matches an entry of ``false_error_list``.

    Every entry of ``false_error_list`` may hold compiled regular expressions
    under the keys 'url', 'valid' and 'parent'. A key that is absent from an
    entry puts no constraint on the record. The optional 'cnx' key of an
    entry is not consulted: as before, only ``cnx`` itself is tested.

    A pattern that is present is never satisfied by a missing (None) value.
    The previous bare ``except`` clauses turned the resulting TypeError into
    a match, so a record without URL or parent text matched every pattern.

    Args:
        url: element whose ``.text`` is the checked URL.
        valid: result string of the record.
        parent: parent URL of the record (a string, or None).
        cnx: HTTP status obtained by the extra connection check, or 0 when
            no such check was made.

    Returns:
        True when the record is a known false error, False otherwise.
    """
    url_text = getattr(url, 'text', None)
    if url_text is None:
        return False

    # A known false error is only accepted when no connection check was made
    # (0) or when that check answered 404.
    if cnx not in (0, 404):
        return False

    record = (('url', url_text), ('valid', valid), ('parent', parent))
    for false_error in false_error_list:
        for key, text in record:
            pattern = false_error.get(key)
            if pattern is None:
                continue  # no constraint on this field
            if text is None or pattern.search(text) is None:
                break  # this entry does not match
        else:
            return True  # is a false error

    return False  # may be an error


def Checking_on_warning_error_list(valid):
    """Tell whether a result string matches an entry of ``warning_list``.

    valid : result string of the record

    Returns True as soon as the 'valid' pattern of one entry of
    ``warning_list`` is found in ``valid``; False when none is.
    """
    hits = (entry['valid'].search(valid) for entry in warning_list)
    return any(hit is not None for hit in hits)

# ---------------------------------------------------------------------------- #

# Types of error :
#    Error: 401 Unauthorized
#    Error: 403 Forbidden
#    Error: 404 Not Found
#    Error: 504 Gateway Time-out
#    Error: 502 Bad Gateway
#    Error: ReadTimeout:
#    ConnectionError: ('Connection aborted.

# Result patterns that turn an error into a warning: main() still reports the
# record, but returns the WARNING status when every reported error is of this
# kind. Each entry holds a compiled regular expression under 'valid', searched
# in the record's result string by Checking_on_warning_error_list().
warning_list = [
    { 'valid': re.compile('^ReadTimeout')
    },
]

# Records that are not errors, matched by Checking_on_no_error_list().
# Each entry holds compiled regular expressions searched in the record's URL
# ('url'), info message ('info') and result string ('valid').
# The dots of the host names are escaped so that they only match a literal
# '.'; an unescaped '.' matches any character.
# NOTE(review): the jstor entry expects '403 Unauthorized', whereas the error
# types listed in this file are '401 Unauthorized' and '403 Forbidden' --
# confirm which result string is intended.
no_error_list = [
    { 'url'  : re.compile(r'(doi|aps|stacks\.iop)\.org'),
      'info' : re.compile('^Redirected'),
      'valid': re.compile('^403 Forbidden')
    },
    { 'url'  : re.compile(r'jstor\.org'),
      'valid': re.compile('^403 Unauthorized')
    },
]

# Known "false" errors: main() counts the records matching one of these
# entries and leaves them out of the report.
# Each entry is matched by Checking_on_false_error_list(): 'url', 'valid' and
# 'parent' are regular expressions searched in the record's URL, result string
# and parent URL; a key that is absent puts no constraint on the record.
# NOTE(review): the 'cnx' key is never read by Checking_on_false_error_list(),
# which compares the probed status with the literal 404 instead.
# NOTE(review): the misspelled names ('dx.doi.orgg', 'en.wwikipedia.org',
# 'abiconfigg') look like deliberately broken test links -- main() prints the
# number of false errors it expects to find -- confirm before "fixing" them.
false_error_list = [
    { 'url'  : re.compile('(dx.doi.orgg|en.wwikipedia.org)'),
      'valid': re.compile('^ConnectionError'),
    },
    { 'url'  : re.compile('10.1102/physrevb.27.4760'),
      'cnx'  : 404
    },
    { 'url'  : re.compile('10.1103/physrevb.87.085323'),
      'cnx'  : 404
    },
    { 'url'   : re.compile('abiconfigg'),
      'parent': re.compile('testlink/'),
      'cnx'   : 404
    },
    { 'url'   : re.compile('FAKE_URL'),
      'parent': re.compile('testlink/')
    },
]

# Exact URLs that Checking_on_url_to_skip() ignores when the record's
# <extern> text is "1" and its result is "syntax OK" or "filtered".
# The comparison is a strict string equality, hence the variants with and
# without a trailing slash. Each URL is listed once.
url_to_skip = [
    "https://github.com/abinit/abiconfig",
    "https://github.com/abinit/abiflows",
    "https://github.com/abinit/abiconda",
    "https://github.com/abinit/abitutorials",
    "https://github.com/abinit/abipy",
    "https://github.com/abinit/abiout",
    "https://github.com/abinit/abinit/",
    "https://github.com/abinit/abinit",
    "https://github.com/abinit/",
    "https://github.com/abinit",
    "https://www.facebook.com/abinit.org",
    "https://fonts.gstatic.com",
]

# Substrings of URLs to ignore: Checking_on_url_string_to_skip() skips a
# record whose <extern> text is "1" and whose URL contains one of them.
url_string_to_skip = [
    "cdn.jsdelivr.net",
    "cdn.embedly.com",
    "cdn.plot.ly",
    "maxcdn.bootstrapcdn.com",
    "facebook.com/abinit",
    "github.com/abinit/abinit/edit",
    "markdown-here/wiki/Markdown",
    "nschloe/betterbib",
    "github.com/mitya57",
    "github.com/helderco",
    "github.com/abinit/abinit/commit/",
    "github.com/abinit/abipy_assets",
    "github.com/abinit/abinit/tree",
    "github.com/abinit/abinit/blob",

]

# ---------------------------------------------------------------------------- #

#
# Main program
#
def _element_text(element, default=None):
    """Return ``element.text``, or *default* when the element or its text is missing."""
    text = getattr(element, 'text', None)
    return default if text is None else text


def _report_true_error(child, url, parent, info, valid, cnx_status):
    """Print the report block of one true error.

    ``child`` is the <urldata> element of the record; ``cnx_status`` is the
    HTTP status obtained by the extra connection check, or None when no
    response was obtained. Missing fields are printed as placeholders instead
    of raising.
    """
    line = "{0:12} {1}"
    print(line.format('URL', _element_text(url, ' ** NO URL **')))
    print(line.format('Name', _element_text(child.find('name'), '** NO NAME **')))

    parent_text = _element_text(parent)
    shown_parent = rm_server(parent_text) if parent_text is not None else None
    if shown_parent is None:
        # rm_server() may return None when there is no server prefix to strip.
        shown_parent = '** NO PARENT **' if parent_text is None else parent_text
    parent_line = parent.get('line') if parent is not None else None
    print("{0:12} {1}, line {2}".format('Parent URL', shown_parent, parent_line))

    if info is not None:
        print(line.format('Infos', info.text))
    print(line.format('Real URL', _element_text(child.find('realurl'), '** NO REAL URL **')))
    print(line.format('Result', valid))
    if cnx_status is not None:
        print(line.format('Status CNX', cnx_status))
    print('---------------------------')


def main(filename, home_dir=""):
    """Filter the 'false' errors out of a linkchecker XML report.

    Every /linkchecker/urldata record of ``filename`` goes through a series
    of filters (duplicates, "200" results, valid mail addresses, URLs to
    skip, the no-error table). A record whose result is "syntax OK" is probed
    with an HTTP GET. What is left is either counted as a known false error,
    or printed as a true error (possibly flagged as a warning).

    Args:
        filename: path of the XML report to parse.
        home_dir: top directory of the ABINIT source tree; the process exits
            with status 1 when "configure.ac" or "src/98_main/abinit.F90" is
            not found below it.

    Returns:
        0 when no true error was found, 1 when every true error matches
        ``warning_list``, 2 otherwise.
    """
    # Check if we are in the top of the ABINIT source tree
    my_name = os.path.basename(__file__) + ".main"
    required = ("configure.ac", "src/98_main/abinit.F90")
    if not all(os.path.exists(os.path.join(home_dir, path)) for path in required):
        print("%s: You must be in the top of an ABINIT source tree." % my_name)
        print("%s: Aborting now." % my_name)
        sys.exit(1)

    tree = etree.parse(filename)

    rc = 0   # true error counter
    frc = 0  # false error counter
    wrc = 0  # warning error counter

    urls = set()
    ok_status = re.compile("^200")   # status "200" or "200 OK"

    for child in tree.xpath("/linkchecker/urldata"):

        url = child.find('url')
        parent = child.find('parent')
        info = child.find('infos/info')
        url_text = _element_text(url)
        parent_text = _element_text(parent)
        extern_text = _element_text(child.find('extern'))
        valid = child.find('valid').get("result")

        ### check for duplicate entry except for FAKE_URL

        if url_text is not None and "FAKE_URL" not in url_text:
            if url_text in urls:
                continue
            urls.add(url_text)

        ### precleaning ###

        if ok_status.search(valid):
            continue

        # The URL-based filters need a URL; a record without one goes on to
        # the report instead of crashing the run.
        if url_text is not None:
            if url_text[0:6] == "mailto" and valid == "Valid mail address syntax":
                continue

            if Checking_on_url_to_skip(extern_text, url_text, valid):
                continue

            if Checking_on_url_string_to_skip(extern_text, url_text):
                continue

        ### fine cleaning ###

        if Checking_on_no_error_list(url, info, valid):
            continue

        ### last chance to know if it's not an error:       ###
        ### "syntax OK" records are probed with an HTTP GET ###

        cnx_status = 0
        got_response = False
        if valid == "syntax OK" and url_text is not None:
            if debug:
                print("check cnx : ", url_text)
            try:
                response = requests.get(url_text, headers={"content-type": "text"},
                                        timeout=(2, 2))
            except (requests.Timeout, requests.ConnectionError, KeyError) as e:
                if debug:
                    print('failed to connect to website ({})'.format(e))
                continue
            except requests.RequestException as e:
                # Any other failure of the request used to abort the whole
                # run; the link is now kept and reported below.
                if debug:
                    print('failed to connect to website ({})'.format(e))
            else:
                got_response = True
                cnx_status = response.status_code
                if cnx_status == 200:  # OK
                    continue
                if cnx_status == 403:  # cnx ok but Forbidden for robot
                    continue

        # check if the error is a "false" error
        if Checking_on_false_error_list(url=url, valid=valid, parent=parent_text,
                                        cnx=cnx_status):
            frc += 1
            continue

        # check if it's a warning (it is still reported as an error below)
        if Checking_on_warning_error_list(valid=valid):
            wrc += 1

        # found a true error... : reporting on bb
        rc += 1
        _report_true_error(child, url, parent, info, valid,
                           cnx_status if got_response else None)

    print('false errors found : ',frc,' ( must be : 7 )')

    if rc != 0:
        if rc - wrc != 0:
            return 2  # FAILED
        print('warning errors found : ',wrc, ' [probably, these errors are transient...]')
        return 1  # WARNING

    return 0  # SUCCESS

# ---------------------------------------------------------------------------- #

if __name__ == "__main__":
    # Command-line entry point: parse the arguments, run main() and use its
    # return value (0 = SUCCESS, 1 = WARNING, 2 = FAILED) as the exit status.

    parser = argparse.ArgumentParser(description='Remove false errors')
    parser.add_argument('--verbose', '-v', action='count',
                        help='increase verbosity. Specify multiple times')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(version),
                        help='show the version number and exit')
    # The stray trailing comma that followed this call (it built a discarded
    # one-element tuple) has been removed.
    parser.add_argument('filename', help='input file (xml format)')
    parser.add_argument('home_dir', nargs='?', default=os.getcwd(),
                        help='top of the ABINIT source tree '
                             '(default: current directory)')

    args = parser.parse_args()

    # --verbose used to be parsed and then ignored; it now enables the debug
    # traces printed by main().
    if args.verbose:
        debug = True

    exit_status = main(args.filename, args.home_dir)
    sys.exit(exit_status)