File: multi_threading_file_compression.py

package info (click to toggle)
python-pymzml 2.5.2%2Brepack1-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 27,792 kB
  • sloc: python: 6,495; pascal: 341; makefile: 233; sh: 30
file content (67 lines) | stat: -rwxr-xr-x 1,810 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python3

import sys
import os
from pymzml.utils.utils import index_gzip
import pymzml
import glob
import multiprocessing


def main(folder, num_cpus=1):
    """
    Creates indexed gzip mzML files from all mzML files in the given folder
    using a given number of threads (worker processes).

    Usage:
        python multi_threading_file_compression.py <folder> <threads>

    Args:
        folder (str): directory searched (non-recursively) for ``*.mzML``
            files.
        num_cpus (int or str): requested number of worker processes,
            defaults to 1. Strings are accepted because the value usually
            comes straight from the command line.

    Note:
        If the number of threads is larger than the number of actual possible
        threads, all possible threads will be used. Values below 1 are
        raised to 1. Files for which ``<file>.gz`` already exists are
        skipped.

    """
    # Clamp the request to [1, cpu_count]; multiprocessing.Pool raises a
    # ValueError for a process count below 1.
    num_cpus = max(1, min(int(num_cpus), multiprocessing.cpu_count()))

    mzml_job_list = []
    for mzml_path in glob.glob(os.path.join(folder, "*.mzML")):
        out_path = "{0}.gz".format(mzml_path)
        if os.path.exists(out_path):
            print("Skipping: {0}".format(mzml_path))
            continue
        mzml_job_list.append((mzml_path, out_path))
    print(
        "Compressing {0} mzML files using {1} threads".format(
            len(mzml_job_list), num_cpus
        )
    )
    # Do not spawn worker processes when there is nothing to compress.
    if mzml_job_list:
        # starmap blocks until every job has finished, so the pool can be
        # torn down as soon as the with-block is left; the context manager
        # also cleans up the workers if a job raises.
        with multiprocessing.Pool(num_cpus) as mp_pool:
            mp_pool.starmap(compress_file, mzml_job_list)
    print("Done")
    return


def compress_file(file_path, out_path):
    """
    Create an indexed gzip version of a single mzML file.

    Args:
        file_path (str): path of the mzML file to compress.
        out_path (str): path the compressed file is written to.

    Prints a message before and after the file is processed and returns
    None.
    """
    print("Working on file {0}".format(file_path))
    # Size of the input in bytes; the number of its decimal digits is handed
    # to index_gzip as idx_len. os.path.getsize avoids opening the file in
    # text mode, where tell() is only defined as an opaque number.
    max_offset_len = os.path.getsize(file_path)
    # NOTE(review): the +10 presumably leaves headroom for index entries that
    # are not spectra -- confirm against index_gzip.
    max_spec_no = pymzml.run.Reader(file_path).get_spectrum_count() + 10

    index_gzip(
        file_path, out_path, max_idx=max_spec_no, idx_len=len(str(max_offset_len))
    )
    print("Wrote file {0}".format(out_path))
    return


if __name__ == "__main__":
    # Expected call: <script> <folder> [<threads>]
    cli_args = sys.argv[1:]
    if not cli_args:
        # No folder given: show the usage text stored in main's docstring.
        print(main.__doc__)
        # sys.exit() is always available, unlike the exit() helper that is
        # only injected by the site module.
        sys.exit()
    if len(cli_args) > 2:
        # main() accepts at most two positional arguments; show the usage
        # text instead of failing with a TypeError traceback.
        print(main.__doc__)
        sys.exit(1)
    main(*cli_args)