File: merge.py

package info (click to toggle)
pytorch-audio 2.6.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 10,696 kB
  • sloc: python: 61,274; cpp: 10,031; sh: 128; ansic: 70; makefile: 34
file content (80 lines) | stat: -rw-r--r-- 2,241 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import argparse
import os

# Merge the per-job label CSVs written by a parallel preprocessing run into a
# single label file, report the size of the merged set, and delete the per-job
# files.

# Divisor applied to the summed length column (after /3600) to obtain hours.
# NOTE(review): presumably the video frame rate (25 fps), i.e. lengths are
# counted in frames -- confirm against the script that writes the labels.
_UNITS_PER_SECOND = 25.0


def _build_parser():
    """Return the command-line parser for the label-merging script."""
    parser = argparse.ArgumentParser(description="Merge labels")
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Specify the dataset used in the experiment",
    )
    parser.add_argument(
        "--subset",
        type=str,
        required=True,
        help="Specify the subset of the dataset used in the experiment",
    )
    parser.add_argument(
        "--root-dir",
        type=str,
        required=True,
        help="Directory of saved mouth patches or embeddings",
    )
    parser.add_argument(
        "--groups",
        type=int,
        required=True,
        help="Number of threads for parallel processing",
    )
    parser.add_argument(
        "--seg-duration",
        type=int,
        default=16,
        help="Length of the segments",
    )
    return parser


def merge_labels(root_dir, dataset, subset, groups, seg_duration=16):
    """Merge the per-job label files of one dataset subset into a single CSV.

    Reads ``<root_dir>/labels/<dataset>_<subset>_transcript_lengths_seg
    <seg_duration>s.<groups>.<job_index>.csv`` for every ``job_index`` in
    ``range(groups)``, writes their concatenated rows to
    ``<root_dir>/labels/<dataset>_<subset>_transcript_lengths_seg
    <seg_duration>s.csv``, prints a summary and removes the per-job files.

    Args:
        root_dir: Directory containing the ``labels`` sub-directory.
        dataset: Dataset name used in the label file names.
        subset: Subset name used in the label file names.
        groups: Number of per-job label files to merge.
        seg_duration: Segment length used in the label file names.

    Returns:
        A ``(num_rows, total_hours)`` tuple describing the merged file.

    Raises:
        FileNotFoundError: If any per-job label file is missing. Nothing is
            written or removed in that case.
    """
    label_dir = os.path.join(root_dir, "labels")
    stem = f"{dataset}_{subset}_transcript_lengths_seg{seg_duration}s"
    part_filenames = [
        os.path.join(label_dir, f"{stem}.{groups}.{job_index}.csv")
        for job_index in range(groups)
    ]

    # Read every part before writing anything, so a missing part aborts the
    # merge without leaving a partial output behind.
    lines = []
    for part_filename in part_filenames:
        try:
            with open(part_filename, "r") as file:
                # Blank lines carry no label and would break the length
                # parsing below, so they are dropped here.
                lines.extend(
                    line for line in file.read().splitlines() if line.strip()
                )
        except FileNotFoundError as err:
            raise FileNotFoundError(f"{part_filename} does not exist.") from err

    # Write the merged labels to a new file (rows joined by "\n", with no
    # trailing newline, matching the established output format).
    dst_label_filename = os.path.join(label_dir, f"{stem}.csv")
    with open(dst_label_filename, "w") as file:
        file.write("\n".join(lines))

    # Print the number of files and total duration in hours; the third CSV
    # column holds the per-file length.
    total_length = sum(int(line.split(",")[2]) for line in lines)
    total_duration = total_length / 3600.0 / _UNITS_PER_SECOND
    print(f"The completed set has {len(lines)} files with a total of {total_duration:.2f} hours.")

    # Remove the label files for each job index.
    print("** Remove the temporary label files **")
    for part_filename in part_filenames:
        try:
            os.remove(part_filename)
        except FileNotFoundError:
            # Already gone (e.g. removed concurrently); nothing left to do.
            pass

    return len(lines), total_duration


def main(argv=None):
    """Parse command-line arguments and merge the per-job label files.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # Check that there is more than one group. An explicit error is used
    # instead of ``assert`` so the check survives ``python -O``.
    if args.groups <= 1:
        parser.error("There is no need to use this script for merging when --groups is 1.")

    merge_labels(args.root_dir, args.dataset, args.subset, args.groups, args.seg_duration)
    print("** Finish **")


if __name__ == "__main__":
    main()