File: ia_delete.py

package info (click to toggle)
python-internetarchive 3.3.0-2~deb12u1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,096 kB
  • sloc: python: 6,276; xml: 180; makefile: 180
file content (147 lines) | stat: -rw-r--r-- 5,652 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#
# The internetarchive module is a Python/CLI interface to Archive.org.
#
# Copyright (C) 2012-2019 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Delete files from Archive.org.

usage:
    ia delete <identifier> <file>... [options]...
    ia delete <identifier> [options]...
    ia delete --help

options:
    -h, --help
    -q, --quiet                  Turn off ia's output.
    -c, --cascade                Delete all files associated with the specified file,
                                 including upstream derivatives and the original
                                 file.
    -H, --header=<key:value>...  S3 HTTP headers to send with your request.
    -a, --all                    Delete all files in the given item (Note: Some files,
                                 such as <identifier>_meta.xml and <identifier>_files.xml,
                                 cannot be deleted)
    -d, --dry-run                Output files to be deleted to stdout, but don't actually
                                 delete.
    -g, --glob=<pattern>         Only delete files matching the given pattern.
    -f, --format=<format>...     Only delete files matching the specified format(s).
    -R, --retries=<i>            Number of times to retry if S3 returns a 503 SlowDown
                                 error [default: 2].
    --no-backup                  Turn off archive.org backups. Clobbered files
                                 will not be saved to history/files/$key.~N~
                                 [default: True].
"""
import sys

import requests.exceptions
from docopt import docopt, printable_usage
from schema import And, Or, Schema, SchemaError, Use  # type: ignore[import]

from internetarchive import ArchiveSession
from internetarchive.cli.argparser import convert_str_list_to_unicode, get_args_dict
from internetarchive.utils import get_s3_xml_text


def main(argv, session: ArchiveSession) -> None:
    """Run the ``ia delete`` subcommand.

    Parses *argv* against this module's usage string, selects the files of
    the requested item (all files, by glob, by format, or by explicit name)
    and deletes each of them unless ``--dry-run`` is given.

    :param argv: Command-line arguments for the subcommand, handed to docopt.
    :param session: Session used to look up the item.

    Exits with status 1 when the arguments fail validation, the item does
    not exist, no files are selected, or at least one deletion fails.
    """
    args = docopt(__doc__, argv=argv)

    # Validate args. Options that take a value arrive as lists here (hence
    # ``i[0]`` for --retries and the ``list`` checks for --glob/--format).
    schema = Schema({
        str: Use(bool),
        '<file>': list,
        '--format': list,
        '--header': Or(None, And(Use(get_args_dict), dict),
                       error='--header must be formatted as --header="key:value"'),
        '--glob': list,
        'delete': bool,
        '--retries': Use(lambda i: int(i[0])),
        '<identifier>': str,
    })
    try:
        args = schema.validate(args)
    except SchemaError as exc:
        print(f'{exc}\n{printable_usage(__doc__)}', file=sys.stderr)
        sys.exit(1)

    verbose = not args['--quiet']
    item = session.get_item(args['<identifier>'])
    if not item.exists:
        # Nothing can be deleted from a missing item; stop here with the
        # identifier filled into the message.
        print(f"{item.identifier}: skipping, item doesn't exist.", file=sys.stderr)
        sys.exit(1)

    # Suffixes of files that cannot be deleted via S3.
    no_delete = ('_meta.xml', '_files.xml', '_meta.sqlite')

    # The schema permits ``--header`` to be None; fall back to an empty dict
    # so the lookups below cannot fail.
    headers = args['--header'] or {}

    # Add keep-old-version by default.
    if not headers.get('x-archive-keep-old-version') and not args['--no-backup']:
        headers['x-archive-keep-old-version'] = '1'

    if verbose:
        print(f'Deleting files from {item.identifier}', file=sys.stderr)

    # Every branch materialises the selection into a list so that the
    # emptiness check below is meaningful even if get_files() is lazy.
    if args['--all']:
        files = list(item.get_files())
        args['--cascade'] = True
    elif args['--glob']:
        files = list(item.get_files(glob_pattern=args['--glob']))
    elif args['--format']:
        files = list(item.get_files(formats=args['--format']))
    else:
        # A lone "-" means: read the file names from stdin, one per line.
        raw_names = sys.stdin if args['<file>'] == ['-'] else args['<file>']
        fnames = [name for name in (raw.strip() for raw in raw_names) if name]
        # NOTE(review): an empty name list is deliberately not passed to
        # get_files(), which presumably treats "no filter" as "every file";
        # deleting everything must require an explicit --all.
        files = list(item.get_files(fnames)) if fnames else []

    if not files:
        print(' warning: no files found, nothing deleted.', file=sys.stderr)
        sys.exit(1)

    errors = False

    for f in files:
        if not f:
            if verbose:
                missing = getattr(f, 'name', f)
                print(f' error: "{missing}" does not exist', file=sys.stderr)
            errors = True
            # There is nothing to delete for a missing file.
            continue
        if f.name.endswith(no_delete):
            continue
        if args['--dry-run']:
            print(f' will delete: {item.identifier}/{f.name}', file=sys.stderr)
            continue
        try:
            resp = f.delete(verbose=verbose,
                            cascade_delete=args['--cascade'],
                            headers=headers,
                            retries=args['--retries'])
        except requests.exceptions.RetryError:
            print(f' error: max retries exceeded for {f.name}', file=sys.stderr)
            errors = True
            continue

        # Any status other than 204 is reported as a failed delete.
        if resp.status_code != 204:
            errors = True
            msg = get_s3_xml_text(resp.content)
            print(f' error: {msg} ({resp.status_code})', file=sys.stderr)

    if errors:
        sys.exit(1)