ast-lrl-speech/utils.py at main · McGill-NLP/ast-lrl-speech · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from mapping import FLEURS_LANGUAGE_CODES
import json
import matching
import metrics
from typing import Tuple

def rename_json_property(file=None, old_name=None, new_name=None):
    """
    Renames a property inside a JSON file and rewrites that file in place.

    The top-level JSON value may be either a single object or a list. For a
    list, every element that is an object containing `old_name` is renamed;
    other elements are left untouched. For a single object, `old_name` must
    be present.

    Parameters:
        file (str): Path to the JSON file to update (read, then overwritten).
        old_name (str): Name of the property to rename.
        new_name (str): Name the property should have afterwards.

    Raises:
        KeyError: The top-level value is an object without `old_name`.
        TypeError: The top-level value is neither a list nor an object.
    """
    with open(file, 'r', encoding="utf-8") as source:
        payload = json.load(source)

    if isinstance(payload, dict):
        # Single object: a missing key is an error here (unlike list items).
        if old_name not in payload:
            raise KeyError(f"Key '{old_name}' not found in the JSON data.")
        payload[new_name] = payload.pop(old_name)
    elif isinstance(payload, list):
        # List of records: rename where possible, silently skip the rest.
        for record in payload:
            if isinstance(record, dict) and old_name in record:
                record[new_name] = record.pop(old_name)
    else:
        raise TypeError("Unexpected JSON structure. Expected a list or a dictionary.")

    # Only reached when no error was raised, so a bad input never truncates the file.
    with open(file, 'w', encoding="utf-8") as target:
        json.dump(payload, target, indent=4, ensure_ascii=False)
        print(f"Renamed for file : {file}")


def remove_prefix_from_wav_code(input_file, output_file, prefix="test/", key="file_name"):
    """
    Loads JSON data from input_file, removes the specified prefix from the
    `key` field of each entry (by default 'file_name'), and writes the updated
    data to output_file.

    Entries that are not JSON objects, that lack the field, or whose field is
    not a string (e.g. null) are written back unchanged. A top-level JSON
    object is treated as a single entry.

    Parameters:
        input_file (str): Path to the JSON file containing the data.
        output_file (str): Path where the updated JSON will be saved
            (may be the same path as input_file).
        prefix (str): The prefix to remove from the field (default is "test/").
        key (str): Name of the field to strip the prefix from
            (default is "file_name").
    """
    # Load JSON data from the file
    with open(input_file, 'r', encoding='utf-8') as src:
        data = json.load(src)

    # Accept a single object as well as a list of objects.
    entries = [data] if isinstance(data, dict) else data

    # Strip the prefix only from string values that actually start with it.
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        value = entry.get(key)
        # Guard against null / non-string values, which have no startswith().
        if isinstance(value, str) and value.startswith(prefix):
            entry[key] = value[len(prefix):]

    # Write the updated data to the output file
    with open(output_file, 'w', encoding='utf-8') as dst:
        json.dump(data, dst, ensure_ascii=False, indent=4)


def average_words_per_entry(json_path: str) -> Tuple[int, int, float]:
    """
    Computes the average number of whitespace-separated words in the
    'gold_transcript' field across the entries of a JSON file.

    Entries without the field, or whose transcript is empty after stripping
    whitespace, are excluded from both the entry count and the word count.
    The path and the average are also printed.

    Parameters:
        json_path (str): Path to a JSON file holding a list of entries.

    Returns:
        Tuple[int, int, float]: (number of counted entries, total words,
        average words per counted entry); the average is 0.0 when no entry
        was counted.
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    # Stripped transcripts, lazily; empty ones are filtered out just below.
    stripped = (record.get("gold_transcript", "").strip() for record in records)
    words_per_entry = [len(text.split()) for text in stripped if text]

    total_entries = len(words_per_entry)
    total_words = sum(words_per_entry)

    if total_entries:
        average = total_words / total_entries
    else:
        average = 0.0

    print(json_path, average)
    return total_entries, total_words, average


# codes_model = [
#     "hi_in",
#     "pa_in",
#     "ta_in",
#     "te_in",
#     "ml_in",
#     # "sw_ke",
#     # "ha_ng",
#     # "yo_ng",
#     # "ig_ng",
#     # "lg_ug",
#     # "fr_fr",
#     ]


# codes_lang = [
#     "hi_in",
#     "pa_in",
#     "ta_in",
#     "te_in",
#     "ml_in",
#     # "sw_ke",
#     # "ha_ng",
#     # "yo_ng",
#     # "ig_ng",
#     # "lg_ug",
#     # "fr_fr",
#     ]

# Language/locale codes to process; each one names a prediction file
# `asr_corrector/prediction_plus_json/<code>.json`.
codes = [
    "hi_in", "pa_in", "ta_in", "te_in", "ml_in",
    "sw_ke", "ha_ng", "yo_ng", "ig_ng", "lg_ug",
    # "fr_fr",
]

# Runs at module import/execution time: scores every prediction file in turn.
for code in codes:
    file = f"asr_corrector/prediction_plus_json/{code}.json"

    # One-off preprocessing steps, kept here disabled:
    # remove_prefix_from_wav_code(file, file)
    # rename_json_property(file, "file_id", "file_name")
    # matching.gold_codes_matching(file, f"fleurs_lang_info/{code}_fleurs_info.csv", file)
    # matching.gold_text_matching(file, "fleurs_lang_info/en_translations.csv", file, "translation") # Translations

    # BLEU is computed first, then WER, for the same file and code.
    metrics.compute_bleu_score(file, code, "nllb")
    # matching.gold_text_matching(file, f"fleurs_lang_info/{code}_fleurs_info.csv", file, "transcription") # Transcripts
    metrics.simple_wer(file, code)

    # Further optional diagnostics, disabled:
    # print(evaluation.detailed_wer(file))
    # average_words_per_entry(file)