From fc927299e5d0b3f6a618948e738762e255a7f403 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 16 Jan 2025 12:12:29 +0900 Subject: [PATCH 01/23] add conversion script --- .../ja/ja_warp_pdf/.gitignore | 13 ++ .../ja/ja_warp_pdf/.python-version | 1 + .../llm-jp-corpus-v4/ja/ja_warp_pdf/README.md | 21 +++ .../ja/ja_warp_pdf/examples/example.jsonl | 5 + .../ja/ja_warp_pdf/pyproject.toml | 19 +++ .../ja/ja_warp_pdf/requirements-dev.lock | 100 ++++++++++++ .../ja/ja_warp_pdf/requirements.lock | 100 ++++++++++++ .../ja/ja_warp_pdf/scripts/convert.py | 149 ++++++++++++++++++ .../ja/ja_warp_pdf/scripts/test_convert.py | 74 +++++++++ 9 files changed, 482 insertions(+) create mode 100644 corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/.gitignore create mode 100644 corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/.python-version create mode 100644 corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md create mode 100644 corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/examples/example.jsonl create mode 100644 corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/pyproject.toml create mode 100644 corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements-dev.lock create mode 100644 corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements.lock create mode 100644 corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py create mode 100644 corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/.gitignore b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/.gitignore new file mode 100644 index 00000000..b089da4a --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/.gitignore @@ -0,0 +1,13 @@ +# python generated files +__pycache__/ +*.py[oc] +build/ +dist/ +wheels/ +*.egg-info + +# venv +.venv + +# bunkai +bunkai_model/ diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/.python-version b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/.python-version new file mode 100644 index 00000000..d9506ceb --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/.python-version @@ -0,0 +1 @@ 
+3.12.5 diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md new file mode 100644 index 00000000..f7800e0e --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md @@ -0,0 +1,21 @@ +# ja-warp-pdf + +Preprocess text extracted from PDFs provided by WARP. + +## Environment + +- Python 3.12.5 + +## Installation + +Use [rye](https://rye.astral.sh/) to install the dependencies. + +```bash +rye sync +``` + +and then download the Bunkai sentence splitter model. + +```bash +rye run bunkai --model bunkai_model --setup +``` diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/examples/example.jsonl b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/examples/example.jsonl new file mode 100644 index 00000000..bdf53e5e --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/examples/example.jsonl @@ -0,0 +1,5 @@ +{"docId": "example0", "text": "本勉強会では、自然言語処理および計算機システムの研究者が集まり大規模言語モデルの研究開発について定期的に情報共有を行っています。"} +{"docId": "example1", "text": "本勉強会では、自然言語処理および\n計算機システムの研究者が集まり\n大規模言語モデルの研究開発について\n定期的に情報共有を行っています。"} +{"docId": "example2", "text": "本勉強会では、自然言\n語処理および計算機シ\nステムの研究者が集ま\nり大規模言語モデルの\n研究開発について定期\n的に情報共有を行って\nいます。"} +{"docId": "example3", "text": "本\n勉\n強\n会\nで\nは\n、\n自\n然\n言\n語\n処\n理\nお\nよ\nび\n計\n算\n機\nシ\nス\nテ\nム\nの\n研\n究\n者\nが\n集\nま\nり\n大\n規\n模\n言\n語\nモ\nデ\nル\nの\n研\n究\n開\n発\nに\nつ\nい\nて\n定\n期\n的\nに\n情\n報\n共\n有\nを\n行\nっ\nて\nい\nま\nす\n。"} +{"docId": "example4", "text": "本 勉 強 会 で は 、 自 然 言 語 処 理 お よ び 計 算 機 シ ス テ ム の 研 究 者 が 集 ま り 大 規 模 言 語 モ デ ル の 研 究 開 発 に つ い て 定 期 的 に 情 報 共 有 を 行 っ て い ま す 。"} diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/pyproject.toml b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/pyproject.toml new file mode 100644 index 00000000..75ae58a6 --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/pyproject.toml @@ -0,0 +1,19 @@ +[project] +name = "ja-warp-pdf" +version = "0.1.0" +description = "Add your description here" +authors = [ + { name = "Hirokazu Kiyomaru", email = 
"h.kiyomaru@gmail.com" } +] +dependencies = [ + "bunkai[lb]>=1.5.7", + "transformers==4.33.3", + "pytest>=8.3.4", +] +readme = "README.md" +requires-python = ">= 3.8" + +[tool.rye] +managed = true +virtual = true +dev-dependencies = [] diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements-dev.lock b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements-dev.lock new file mode 100644 index 00000000..9c9e914c --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements-dev.lock @@ -0,0 +1,100 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false +# universal: false + +bunkai==1.5.7 +certifi==2024.12.14 + # via requests +charset-normalizer==3.4.1 + # via requests +dataclasses-json==0.6.7 + # via bunkai +emoji==2.14.0 + # via bunkai +emojis==0.7.0 + # via bunkai +filelock==3.16.1 + # via huggingface-hub + # via torch + # via transformers +fsspec==2024.12.0 + # via huggingface-hub + # via torch +huggingface-hub==0.27.1 + # via transformers +idna==3.10 + # via requests +iniconfig==2.0.0 + # via pytest +janome==0.5.0 + # via bunkai +jinja2==3.1.5 + # via torch +markupsafe==3.0.2 + # via jinja2 +marshmallow==3.25.1 + # via dataclasses-json +more-itertools==10.6.0 + # via bunkai +mpmath==1.3.0 + # via sympy +mypy-extensions==1.0.0 + # via typing-inspect +networkx==3.4.2 + # via torch +numpy==2.2.1 + # via bunkai + # via transformers +packaging==24.2 + # via huggingface-hub + # via marshmallow + # via pytest + # via transformers +pluggy==1.5.0 + # via pytest +pytest==8.3.4 +pyyaml==6.0.2 + # via huggingface-hub + # via transformers +regex==2024.11.6 + # via bunkai + # via transformers +requests==2.32.3 + # via bunkai + # via huggingface-hub + # via transformers +safetensors==0.5.2 + # via transformers +setuptools==75.8.0 + # via torch +spans==1.1.1 + # via bunkai +sympy==1.13.1 + # via 
torch +tokenizers==0.13.3 + # via transformers +toml==0.10.2 + # via bunkai +torch==2.5.1 + # via bunkai +tqdm==4.67.1 + # via bunkai + # via huggingface-hub + # via transformers +transformers==4.33.3 + # via bunkai +typing-extensions==4.12.2 + # via huggingface-hub + # via torch + # via typing-inspect +typing-inspect==0.9.0 + # via dataclasses-json +urllib3==2.3.0 + # via requests diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements.lock b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements.lock new file mode 100644 index 00000000..9c9e914c --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements.lock @@ -0,0 +1,100 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false +# universal: false + +bunkai==1.5.7 +certifi==2024.12.14 + # via requests +charset-normalizer==3.4.1 + # via requests +dataclasses-json==0.6.7 + # via bunkai +emoji==2.14.0 + # via bunkai +emojis==0.7.0 + # via bunkai +filelock==3.16.1 + # via huggingface-hub + # via torch + # via transformers +fsspec==2024.12.0 + # via huggingface-hub + # via torch +huggingface-hub==0.27.1 + # via transformers +idna==3.10 + # via requests +iniconfig==2.0.0 + # via pytest +janome==0.5.0 + # via bunkai +jinja2==3.1.5 + # via torch +markupsafe==3.0.2 + # via jinja2 +marshmallow==3.25.1 + # via dataclasses-json +more-itertools==10.6.0 + # via bunkai +mpmath==1.3.0 + # via sympy +mypy-extensions==1.0.0 + # via typing-inspect +networkx==3.4.2 + # via torch +numpy==2.2.1 + # via bunkai + # via transformers +packaging==24.2 + # via huggingface-hub + # via marshmallow + # via pytest + # via transformers +pluggy==1.5.0 + # via pytest +pytest==8.3.4 +pyyaml==6.0.2 + # via huggingface-hub + # via transformers +regex==2024.11.6 + # via bunkai + # via transformers +requests==2.32.3 + # via bunkai + # via huggingface-hub + # via 
transformers +safetensors==0.5.2 + # via transformers +setuptools==75.8.0 + # via torch +spans==1.1.1 + # via bunkai +sympy==1.13.1 + # via torch +tokenizers==0.13.3 + # via transformers +toml==0.10.2 + # via bunkai +torch==2.5.1 + # via bunkai +tqdm==4.67.1 + # via bunkai + # via huggingface-hub + # via transformers +transformers==4.33.3 + # via bunkai +typing-extensions==4.12.2 + # via huggingface-hub + # via torch + # via typing-inspect +typing-inspect==0.9.0 + # via dataclasses-json +urllib3==2.3.0 + # via requests diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py new file mode 100644 index 00000000..6b6ae61c --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -0,0 +1,149 @@ +"""Remove intra-sentence line breaks from text.""" + +import argparse +import json +import logging +import pathlib + +import bunkai +import tqdm + +logger = logging.getLogger(__name__) + +root = pathlib.Path(__file__).parent.parent +model_path = root / "bunkai_model" +senter = bunkai.Bunkai(path_model=model_path) + + +def split_text_by_period(text: str) -> list[str]: + """Split text by period. + + Args: + text (str): Input text. + + Returns: + list[str]: List of sentences. + """ + if not text: + return [""] + + chunks: list[str] = [] + chunk: str = "" + for char in text: + chunk += char + if char == "。": + chunks.append(chunk) + chunk = "" + if chunk: + chunks.append(chunk) + return chunks + + +def split_text_by_length(text: str, max_length: int = 100) -> list[str]: + """Split text by length. + + Args: + text (str): Input text. + max_length (int, optional): Maximum length of sentence. Defaults to 100. + + Returns: + list[str]: List of sentences. 
+ """ + if not text: + return [""] + + chunks: list[str] = [] + for i in range(0, len(text), max_length): + chunks.append(text[i : i + max_length]) + return chunks + + +def split_text_by_bunkai(text: str) -> list[str]: + """Split text by Bunkai. + + Args: + text (str): Input text. + + Returns: + list[str]: List of sentences. + """ + if not text: + return [""] + return list(senter(text)) + + +def remove_intra_sentence_line_breaks(text: str) -> str: + """Remove intra-sentence line breaks. + + Args: + text (str): Input text. + + Returns: + str: Processed text. + """ + num_leading_newlines = len(text) - len(text.lstrip("\n")) + num_trailing_newlines = len(text) - len(text.rstrip("\n")) + return ( + "\n" * num_leading_newlines + + text.replace("\n", "") + + "\n" * num_trailing_newlines + ) + + +def process_line(line: str) -> str: + """Process line. + + Args: + line (str): Input line. + + Returns: + str: Processed line. + """ + dat = json.loads(line) + + text = dat["text"] + new_text = "" + for chunk in split_text_by_period(text): + # Split large chunk to avoid long processing time + for chunk in split_text_by_length(chunk, max_length=100): + # Skip sentence splitting by bunkai if there is no line break as it is slow + if "\n" not in chunk: + new_text += chunk + continue + + for chunk in split_text_by_bunkai(chunk): + new_text += remove_intra_sentence_line_breaks(chunk) + + assert text.replace("\n", "") == new_text.replace("\n", "") + + dat["text"] = new_text + + return json.dumps(dat, ensure_ascii=False) + "\n" + + +def main() -> None: + """Main function.""" + parser = argparse.ArgumentParser("Remove intra-sentence line breaks from text.") + parser.add_argument("--input-file", type=str, required=True, help="Input file.") + parser.add_argument("--output-file", type=str, required=True, help="Output file.") + args = parser.parse_args() + + with ( + open(args.input_file, "rt", encoding="utf-8") as fin, + open(args.output_file, "wt", encoding="utf-8") as fout, + ): + for i, 
line in enumerate(tqdm.tqdm(fin), start=1): + try: + line = process_line(line) + except Exception as e: + logger.error(f"Error processing line {i}") + logger.error(e) + fout.write(line) + + +if __name__ == "__main__": + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + level=logging.INFO, + ) + main() diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py new file mode 100644 index 00000000..2f6109e9 --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py @@ -0,0 +1,74 @@ +import pytest + +from convert import ( + split_text_by_period, + split_text_by_length, + split_text_by_bunkai, + remove_intra_sentence_line_breaks, +) + + +@pytest.mark.parametrize( + "text, expected", + [ + ("これはペンです。それは本です。", ["これはペンです。", "それは本です。"]), + ("これはペンです。それは本です", ["これはペンです。", "それは本です"]), + ("これはペンです。", ["これはペンです。"]), + ("これはペンです", ["これはペンです"]), + ("", [""]), + ("これはペンです。。。", ["これはペンです。", "。", "。"]), + ("。。。", ["。", "。", "。"]), + ], +) +def test_split_text_by_period(text: str, expected: list[str]) -> None: + assert split_text_by_period(text) == expected + + +@pytest.mark.parametrize( + "text, expected", + [ + ("", [""]), + ("これはペンです。", ["これは", "ペンで", "す。"]), + ("これはペンです", ["これは", "ペンで", "す"]), + ], +) +def test_split_text_by_length(text: str, expected: list[str]) -> None: + assert split_text_by_length(text, 3) == expected + + +@pytest.mark.parametrize( + "text, expected", + [ + ( + "こういう\n日本語の文章は\nよくあります", + ["こういう\n日本語の文章は\nよくあります"], + ), + ( + "改行が文区切りです\nこういう日本語の文章はよくあります", + ["改行が文区切りです\n", "こういう日本語の文章はよくあります"], + ), + ( + "改行が文区切り\nです\nこういう日本語\nの文章はよくあ\nります\n", + ["改行が文区切り\nです\n", "こういう日本語\nの文章はよくあ\nります\n"], + ), + ], +) +def test_split_text_by_bunkai(text: str, expected: list[str]) -> None: + assert split_text_by_bunkai(text) == expected + + +@pytest.mark.parametrize( + "text, expected", + [ + ( + "こういう\n日本語の文章は\nよくあります", + 
"こういう日本語の文章はよくあります", + ), + ( + "\n\nこういう\n日本語の文章は\nよくあります\n\n", + "\n\nこういう日本語の文章はよくあります\n\n", + ), + ], +) +def test_remove_intra_sentence_line_breaks(text: str, expected: str) -> None: + assert remove_intra_sentence_line_breaks(text) == expected From 89c367efee625231697947041dfb6eb6adc70927 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 16 Jan 2025 12:38:13 +0900 Subject: [PATCH 02/23] workaround --- corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md index f7800e0e..b1b1c466 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md @@ -11,7 +11,7 @@ Preprocess text extracted from PDFs provided by WARP. Use [rye](https://rye.astral.sh/) to install the dependencies. ```bash -rye sync +RUSTFLAGS="-A invalid_reference_casting" rye sync ``` and then download the Bunkai sentence splitter model. From ad30f440802df6ab9ff4746c370d15dd8de80c55 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 16 Jan 2025 12:40:38 +0900 Subject: [PATCH 03/23] update readme --- corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md index b1b1c466..aa940fe5 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/README.md @@ -14,8 +14,18 @@ Use [rye](https://rye.astral.sh/) to install the dependencies. RUSTFLAGS="-A invalid_reference_casting" rye sync ``` -and then download the Bunkai sentence splitter model. +Then download the Bunkai sentence splitter model. ```bash rye run bunkai --model bunkai_model --setup ``` + +## Usage + +### Conversion + +This process converts text to remove unnecessary characters. 
+ +```bash +rye run python scripts/convert.py --input-file --output-file +``` From 75f5a2ff90de1443a6b0c67dbdec3d2e972af419 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 16 Jan 2025 12:44:10 +0900 Subject: [PATCH 04/23] tweak --- corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py | 1 - 1 file changed, 1 deletion(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py index 6b6ae61c..b6894d0f 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -144,6 +144,5 @@ def main() -> None: if __name__ == "__main__": logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - level=logging.INFO, ) main() From 0ef32711bf5c25062390223a3c8ea97f11987f80 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 16 Jan 2025 14:12:06 +0900 Subject: [PATCH 05/23] [wip] concurrent processing --- .../ja/ja_warp_pdf/scripts/convert.py | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py index b6894d0f..201c87bf 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -1,15 +1,21 @@ """Remove intra-sentence line breaks from text.""" import argparse +import concurrent +import concurrent.futures import json import logging +import os import pathlib import bunkai import tqdm +import torch logger = logging.getLogger(__name__) +torch.set_num_threads(1) + root = pathlib.Path(__file__).parent.parent model_path = root / "bunkai_model" senter = bunkai.Bunkai(path_model=model_path) @@ -39,12 +45,12 @@ def split_text_by_period(text: str) -> list[str]: return chunks -def split_text_by_length(text: str, max_length: int = 100) -> list[str]: +def 
split_text_by_length(text: str, max_length: int = 30) -> list[str]: """Split text by length. Args: text (str): Input text. - max_length (int, optional): Maximum length of sentence. Defaults to 100. + max_length (int, optional): Maximum length of sentence. Defaults to 30. Returns: list[str]: List of sentences. @@ -126,19 +132,28 @@ def main() -> None: parser = argparse.ArgumentParser("Remove intra-sentence line breaks from text.") parser.add_argument("--input-file", type=str, required=True, help="Input file.") parser.add_argument("--output-file", type=str, required=True, help="Output file.") + parser.add_argument("--num-workers", type=int, default=1, help="Number of workers.") args = parser.parse_args() with ( open(args.input_file, "rt", encoding="utf-8") as fin, open(args.output_file, "wt", encoding="utf-8") as fout, ): - for i, line in enumerate(tqdm.tqdm(fin), start=1): - try: - line = process_line(line) - except Exception as e: - logger.error(f"Error processing line {i}") - logger.error(e) - fout.write(line) + with concurrent.futures.ThreadPoolExecutor( + max_workers=args.num_workers + ) as executor: + future_to_line = {} + for i, line in enumerate(fin, start=1): + future = executor.submit(process_line, line) + future_to_line[future] = line + for future in tqdm.tqdm(future_to_line, total=i): + try: + fout.write(future.result()) + except Exception as e: + logger.error(f"Error processing line {i}") + logger.error(e) + line = future_to_line[future] + fout.write(line) if __name__ == "__main__": From f33b28dbc1e6c6f8a0f608b0208ac41ca96d4dc7 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 16 Jan 2025 16:56:34 +0900 Subject: [PATCH 06/23] tweak --- .../ja/ja_warp_pdf/scripts/convert.py | 110 ++++++++++-------- .../ja/ja_warp_pdf/scripts/test_convert.py | 17 --- 2 files changed, 60 insertions(+), 67 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py index 
201c87bf..8ae47104 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -5,8 +5,8 @@ import concurrent.futures import json import logging -import os import pathlib +from typing import Iterator, TextIO import bunkai import tqdm @@ -21,31 +21,7 @@ senter = bunkai.Bunkai(path_model=model_path) -def split_text_by_period(text: str) -> list[str]: - """Split text by period. - - Args: - text (str): Input text. - - Returns: - list[str]: List of sentences. - """ - if not text: - return [""] - - chunks: list[str] = [] - chunk: str = "" - for char in text: - chunk += char - if char == "。": - chunks.append(chunk) - chunk = "" - if chunk: - chunks.append(chunk) - return chunks - - -def split_text_by_length(text: str, max_length: int = 30) -> list[str]: +def split_text_by_length(text: str, max_length: int = 20) -> list[str]: """Split text by length. Args: @@ -109,16 +85,16 @@ def process_line(line: str) -> str: text = dat["text"] new_text = "" - for chunk in split_text_by_period(text): - # Split large chunk to avoid long processing time - for chunk in split_text_by_length(chunk, max_length=100): - # Skip sentence splitting by bunkai if there is no line break as it is slow - if "\n" not in chunk: - new_text += chunk - continue + # Split into small chunks to avoid long processing time + for chunk in split_text_by_length(text): + # Skip sentence splitting by bunkai if there is no line break + # as it aims to remove intra-sentence line breaks + if "\n" not in chunk: + new_text += chunk + continue - for chunk in split_text_by_bunkai(chunk): - new_text += remove_intra_sentence_line_breaks(chunk) + for chunk in split_text_by_bunkai(chunk): + new_text += remove_intra_sentence_line_breaks(chunk) assert text.replace("\n", "") == new_text.replace("\n", "") @@ -127,33 +103,67 @@ def process_line(line: str) -> str: return json.dumps(dat, ensure_ascii=False) + "\n" +def process_lines(lines: list[str]) -> str: 
+ """Process lines. + + Args: + lines (list[str]): Input lines. + + Returns: + str: Processed lines. + """ + ret: str = "" + for line in lines: + try: + ret += process_line(line) + except Exception as e: + logger.error(f"Error: {e}") + ret += line + return ret + + +def buffered_read(file: TextIO, buffer_size: int = 32) -> Iterator[list[str]]: + """Buffered read. + + Args: + file: File object. + buffer_size: Buffer size. + + Yields: + str: Line. + """ + lines: list[str] = [] + for line in file: + lines.append(line) + if len(lines) == buffer_size: + yield lines + lines = [] + if lines: + yield lines + + def main() -> None: """Main function.""" parser = argparse.ArgumentParser("Remove intra-sentence line breaks from text.") parser.add_argument("--input-file", type=str, required=True, help="Input file.") parser.add_argument("--output-file", type=str, required=True, help="Output file.") parser.add_argument("--num-workers", type=int, default=1, help="Number of workers.") + parser.add_argument("--buffer-size", type=int, default=32, help="Buffer size.") args = parser.parse_args() with ( open(args.input_file, "rt", encoding="utf-8") as fin, open(args.output_file, "wt", encoding="utf-8") as fout, ): - with concurrent.futures.ThreadPoolExecutor( - max_workers=args.num_workers - ) as executor: - future_to_line = {} - for i, line in enumerate(fin, start=1): - future = executor.submit(process_line, line) - future_to_line[future] = line - for future in tqdm.tqdm(future_to_line, total=i): - try: - fout.write(future.result()) - except Exception as e: - logger.error(f"Error processing line {i}") - logger.error(e) - line = future_to_line[future] - fout.write(line) + with concurrent.futures.ProcessPoolExecutor(args.num_workers) as executor: + futures = [] + for lines in buffered_read(fin, buffer_size=args.buffer_size): + futures.append(executor.submit(process_lines, lines)) + for future in tqdm.tqdm( + concurrent.futures.as_completed(futures), + total=len(futures), + ): + 
fout.write(future.result()) if __name__ == "__main__": diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py index 2f6109e9..226df151 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py @@ -1,29 +1,12 @@ import pytest from convert import ( - split_text_by_period, split_text_by_length, split_text_by_bunkai, remove_intra_sentence_line_breaks, ) -@pytest.mark.parametrize( - "text, expected", - [ - ("これはペンです。それは本です。", ["これはペンです。", "それは本です。"]), - ("これはペンです。それは本です", ["これはペンです。", "それは本です"]), - ("これはペンです。", ["これはペンです。"]), - ("これはペンです", ["これはペンです"]), - ("", [""]), - ("これはペンです。。。", ["これはペンです。", "。", "。"]), - ("。。。", ["。", "。", "。"]), - ], -) -def test_split_text_by_period(text: str, expected: list[str]) -> None: - assert split_text_by_period(text) == expected - - @pytest.mark.parametrize( "text, expected", [ From 91afaea7a8209defceb7fc2d3941ece59db55942 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 16 Jan 2025 16:57:46 +0900 Subject: [PATCH 07/23] fix --- corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py index 8ae47104..a6276a21 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -51,7 +51,7 @@ def split_text_by_bunkai(text: str) -> list[str]: """ if not text: return [""] - return list(senter(text)) + return [s.replace("▁", "\n") for s in senter(text.replace("\n", "▁"))] def remove_intra_sentence_line_breaks(text: str) -> str: From bd1312e39c33f889130c2c0cab6a8e314d9f4da7 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 16 Jan 2025 18:02:09 +0900 Subject: [PATCH 08/23] only process span around newline 
--- .../ja/ja_warp_pdf/scripts/convert.py | 61 ++++++++++++++----- .../ja/ja_warp_pdf/scripts/test_convert.py | 14 +++-- 2 files changed, 54 insertions(+), 21 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py index a6276a21..8823cae8 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -21,22 +21,48 @@ senter = bunkai.Bunkai(path_model=model_path) -def split_text_by_length(text: str, max_length: int = 20) -> list[str]: - """Split text by length. +def split_text_by_newline(text: str, window: int = 5) -> list[str]: + """Split text into chunks so that: + - Concatinating all chunks gives the original text. + - Each newline character has `window` characters before and after it at least, except for the first and last chunks. Args: text (str): Input text. - max_length (int, optional): Maximum length of sentence. Defaults to 30. Returns: - list[str]: List of sentences. + list[str]: List of chunks. 
+ + Example: + >>> list(split_text_by_newline("Hello World\n")) + ["Hello ", "World\n"] + >>> list(split_text_by_newline("Hello\nWorld")) + ["Hello\nWorld"] + >>> list(split_text_by_newline("Hello\nWorld\n")) + ["Hello\nWorld\n"] + >>> list(split_text_by_newline("Hello\nWorld\nHello\nWorld\n")) + ["Hello\nWorld\nHello\nWorld\n"] + >>> list(split_text_by_newline("CONTEXT|Hello\nWorld\nHello\nWorld|CONTEXT")) + ["CONTEXT|", "Hello\nWorld\nHello\nWorld", "|CONTEXT"] """ - if not text: - return [""] - + if "\n" not in text: + return [text] + chunks: list[str] = [] - for i in range(0, len(text), max_length): - chunks.append(text[i : i + max_length]) + chunk: str = "" + newline_pos: int = -1 + for i, char in enumerate(text): + if char == "\n": + newline_pos = i + if len(chunk) > window and "\n" not in chunk: + chunks.append(chunk[:-window]) + chunk = chunk[-window:] + if newline_pos != -1 and i - newline_pos == window + 1: + chunks.append(chunk) + chunk = "" + newline_pos = -1 + chunk += char + if chunk: + chunks.append(chunk) return chunks @@ -83,18 +109,23 @@ def process_line(line: str) -> str: """ dat = json.loads(line) - text = dat["text"] - new_text = "" - # Split into small chunks to avoid long processing time - for chunk in split_text_by_length(text): + text: str = dat["text"] + new_text: str = "" + for chunk in split_text_by_newline(text, window=5): # Skip sentence splitting by bunkai if there is no line break # as it aims to remove intra-sentence line breaks if "\n" not in chunk: new_text += chunk continue + + # Skip long chunks as they are usually so noisy that + # bunkai will not work well + if len(chunk) > 20: + new_text += chunk + continue - for chunk in split_text_by_bunkai(chunk): - new_text += remove_intra_sentence_line_breaks(chunk) + for sent in split_text_by_bunkai(chunk): + new_text += remove_intra_sentence_line_breaks(sent) assert text.replace("\n", "") == new_text.replace("\n", "") diff --git 
a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py index 226df151..bb386da0 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py @@ -1,7 +1,7 @@ import pytest from convert import ( - split_text_by_length, + split_text_by_newline, split_text_by_bunkai, remove_intra_sentence_line_breaks, ) @@ -10,13 +10,15 @@ @pytest.mark.parametrize( "text, expected", [ - ("", [""]), - ("これはペンです。", ["これは", "ペンで", "す。"]), - ("これはペンです", ["これは", "ペンで", "す"]), + ("Hello World\n", ["Hello ", "World\n"]), + ("Hello\nWorld", ["Hello\nWorld"]), + ("Hello\nWorld\n", ["Hello\nWorld\n"]), + ("Hello\nWorld\nHello\nWorld\n", ["Hello\nWorld\nHello\nWorld\n"]), + ("CONTEXT|Hello\nWorld\nHello\nWorld|CONTEXT", ["CONTEXT|", "Hello\nWorld\nHello\nWorld", "|CONTEXT"]), ], ) -def test_split_text_by_length(text: str, expected: list[str]) -> None: - assert split_text_by_length(text, 3) == expected +def test_split_text_by_newline(text: str, expected: list[str]) -> None: + assert split_text_by_newline(text) == expected @pytest.mark.parametrize( From 46a8c8b06fee409beb07107cbdb225c3383cd463 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 16 Jan 2025 18:02:33 +0900 Subject: [PATCH 09/23] lint --- .../llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py | 10 +++++----- .../ja/ja_warp_pdf/scripts/test_convert.py | 5 ++++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py index 8823cae8..65d23409 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -31,7 +31,7 @@ def split_text_by_newline(text: str, window: int = 5) -> list[str]: Returns: list[str]: List of chunks. 
- + Example: >>> list(split_text_by_newline("Hello World\n")) ["Hello ", "World\n"] @@ -46,7 +46,7 @@ def split_text_by_newline(text: str, window: int = 5) -> list[str]: """ if "\n" not in text: return [text] - + chunks: list[str] = [] chunk: str = "" newline_pos: int = -1 @@ -117,7 +117,7 @@ def process_line(line: str) -> str: if "\n" not in chunk: new_text += chunk continue - + # Skip long chunks as they are usually so noisy that # bunkai will not work well if len(chunk) > 20: @@ -155,11 +155,11 @@ def process_lines(lines: list[str]) -> str: def buffered_read(file: TextIO, buffer_size: int = 32) -> Iterator[list[str]]: """Buffered read. - + Args: file: File object. buffer_size: Buffer size. - + Yields: str: Line. """ diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py index bb386da0..52751283 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py @@ -14,7 +14,10 @@ ("Hello\nWorld", ["Hello\nWorld"]), ("Hello\nWorld\n", ["Hello\nWorld\n"]), ("Hello\nWorld\nHello\nWorld\n", ["Hello\nWorld\nHello\nWorld\n"]), - ("CONTEXT|Hello\nWorld\nHello\nWorld|CONTEXT", ["CONTEXT|", "Hello\nWorld\nHello\nWorld", "|CONTEXT"]), + ( + "CONTEXT|Hello\nWorld\nHello\nWorld|CONTEXT", + ["CONTEXT|", "Hello\nWorld\nHello\nWorld", "|CONTEXT"], + ), ], ) def test_split_text_by_newline(text: str, expected: list[str]) -> None: From b940da552ac545fd46f2fdd8821740fdd1093019 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Fri, 17 Jan 2025 14:21:10 +0900 Subject: [PATCH 10/23] update dependencies --- .../ja/ja_warp_pdf/pyproject.toml | 2 +- .../ja/ja_warp_pdf/requirements-dev.lock | 34 ++++++++++++++++++- .../ja/ja_warp_pdf/requirements.lock | 34 ++++++++++++++++++- 3 files changed, 67 insertions(+), 3 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/pyproject.toml 
b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/pyproject.toml index 75ae58a6..0434d064 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/pyproject.toml +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/pyproject.toml @@ -6,7 +6,7 @@ authors = [ { name = "Hirokazu Kiyomaru", email = "h.kiyomaru@gmail.com" } ] dependencies = [ - "bunkai[lb]>=1.5.7", + "bunkai[lb] @ git+https://github.com/hkiyomaru/bunkai@feature/sequential-prediction", "transformers==4.33.3", "pytest>=8.3.4", ] diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements-dev.lock b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements-dev.lock index 9c9e914c..9bf31530 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements-dev.lock +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements-dev.lock @@ -9,7 +9,7 @@ # generate-hashes: false # universal: false -bunkai==1.5.7 +bunkai @ git+https://github.com/hkiyomaru/bunkai@12a2dfa9eb47e203f4135edac3087a2809d92ca7 certifi==2024.12.14 # via requests charset-normalizer==3.4.1 @@ -24,6 +24,7 @@ filelock==3.16.1 # via huggingface-hub # via torch # via transformers + # via triton fsspec==2024.12.0 # via huggingface-hub # via torch @@ -52,6 +53,35 @@ networkx==3.4.2 numpy==2.2.1 # via bunkai # via transformers +nvidia-cublas-cu12==12.4.5.8 + # via nvidia-cudnn-cu12 + # via nvidia-cusolver-cu12 + # via torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via nvidia-cusolver-cu12 + # via torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via nvidia-cusolver-cu12 + # via nvidia-cusparse-cu12 + # via torch +nvidia-nvtx-cu12==12.4.127 + # via torch packaging==24.2 # via huggingface-hub # via marshmallow @@ -90,6 +120,8 @@ 
tqdm==4.67.1 # via transformers transformers==4.33.3 # via bunkai +triton==3.1.0 + # via torch typing-extensions==4.12.2 # via huggingface-hub # via torch diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements.lock b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements.lock index 9c9e914c..9bf31530 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements.lock +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/requirements.lock @@ -9,7 +9,7 @@ # generate-hashes: false # universal: false -bunkai==1.5.7 +bunkai @ git+https://github.com/hkiyomaru/bunkai@12a2dfa9eb47e203f4135edac3087a2809d92ca7 certifi==2024.12.14 # via requests charset-normalizer==3.4.1 @@ -24,6 +24,7 @@ filelock==3.16.1 # via huggingface-hub # via torch # via transformers + # via triton fsspec==2024.12.0 # via huggingface-hub # via torch @@ -52,6 +53,35 @@ networkx==3.4.2 numpy==2.2.1 # via bunkai # via transformers +nvidia-cublas-cu12==12.4.5.8 + # via nvidia-cudnn-cu12 + # via nvidia-cusolver-cu12 + # via torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via nvidia-cusolver-cu12 + # via torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via nvidia-cusolver-cu12 + # via nvidia-cusparse-cu12 + # via torch +nvidia-nvtx-cu12==12.4.127 + # via torch packaging==24.2 # via huggingface-hub # via marshmallow @@ -90,6 +120,8 @@ tqdm==4.67.1 # via transformers transformers==4.33.3 # via bunkai +triton==3.1.0 + # via torch typing-extensions==4.12.2 # via huggingface-hub # via torch From 8a8ea038d7664785c9b0e5b2da01afd416174380 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Fri, 17 Jan 2025 19:53:06 +0900 Subject: [PATCH 11/23] fix --- 
.../ja/ja_warp_pdf/scripts/convert.py | 2 ++ .../ja/ja_warp_pdf/scripts/filter.py | 0 .../ja/ja_warp_pdf/scripts/test_convert.py | 20 +++++++++++++++++++ 3 files changed, 22 insertions(+) create mode 100644 corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py index 65d23409..b474f20c 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -89,6 +89,8 @@ def remove_intra_sentence_line_breaks(text: str) -> str: Returns: str: Processed text. """ + if all(char == "\n" for char in text): + return text num_leading_newlines = len(text) - len(text.lstrip("\n")) num_trailing_newlines = len(text) - len(text.rstrip("\n")) return ( diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py new file mode 100644 index 00000000..e69de29b diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py index 52751283..a9f8ab3a 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/test_convert.py @@ -39,6 +39,10 @@ def test_split_text_by_newline(text: str, expected: list[str]) -> None: "改行が文区切り\nです\nこういう日本語\nの文章はよくあ\nります\n", ["改行が文区切り\nです\n", "こういう日本語\nの文章はよくあ\nります\n"], ), + ( + "\n", + ["\n"], + ), ], ) def test_split_text_by_bunkai(text: str, expected: list[str]) -> None: @@ -56,6 +60,22 @@ def test_split_text_by_bunkai(text: str, expected: list[str]) -> None: "\n\nこういう\n日本語の文章は\nよくあります\n\n", "\n\nこういう日本語の文章はよくあります\n\n", ), + ( + "\nこういう\n日本語の文章は\nよくあります\n", + "\nこういう日本語の文章はよくあります\n", + ), + ( + "\n\n\nこういう\n日本語の文章は\nよくあります\n", + "\n\n\nこういう日本語の文章はよくあります\n", + ), + ( + "\n", + "\n", + ), + ( + "\n\n", + "\n\n", + ), ], ) def 
test_remove_intra_sentence_line_breaks(text: str, expected: str) -> None: From 00a8bee0163da24bb0a283d49431b289e6b7936f Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Sat, 18 Jan 2025 13:48:46 +0900 Subject: [PATCH 12/23] keep order --- corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py index b474f20c..aa38cde1 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -192,10 +192,7 @@ def main() -> None: futures = [] for lines in buffered_read(fin, buffer_size=args.buffer_size): futures.append(executor.submit(process_lines, lines)) - for future in tqdm.tqdm( - concurrent.futures.as_completed(futures), - total=len(futures), - ): + for future in tqdm.tqdm(futures): fout.write(future.result()) From f53a3302616f2fb79ee9c64ba07d2fcfdb892ec1 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Sat, 18 Jan 2025 13:56:44 +0900 Subject: [PATCH 13/23] use cpu count as default parallelism --- .../llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py index aa38cde1..7c65fbc6 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -5,6 +5,7 @@ import concurrent.futures import json import logging +import os import pathlib from typing import Iterator, TextIO @@ -180,15 +181,19 @@ def main() -> None: parser = argparse.ArgumentParser("Remove intra-sentence line breaks from text.") parser.add_argument("--input-file", type=str, required=True, help="Input file.") parser.add_argument("--output-file", type=str, required=True, help="Output 
file.") - parser.add_argument("--num-workers", type=int, default=1, help="Number of workers.") + parser.add_argument("--num-workers", type=int, default=-1, help="Number of workers.") parser.add_argument("--buffer-size", type=int, default=32, help="Buffer size.") args = parser.parse_args() + num_workers = args.num_workers if args.num_workers != -1 else os.cpu_count() + + os.makedirs(os.path.dirname(args.output_file), exist_ok=True) + with ( open(args.input_file, "rt", encoding="utf-8") as fin, open(args.output_file, "wt", encoding="utf-8") as fout, ): - with concurrent.futures.ProcessPoolExecutor(args.num_workers) as executor: + with concurrent.futures.ProcessPoolExecutor(num_workers) as executor: futures = [] for lines in buffered_read(fin, buffer_size=args.buffer_size): futures.append(executor.submit(process_lines, lines)) From 1ff7b3b4f34a3bf3cd7185269c17f1111377350c Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Mon, 20 Jan 2025 09:30:39 +0900 Subject: [PATCH 14/23] add overwrite option --- .../ja/ja_warp_pdf/scripts/convert.py | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py index 7c65fbc6..a0c20d21 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -156,11 +156,16 @@ def process_lines(lines: list[str]) -> str: return ret -def buffered_read(file: TextIO, buffer_size: int = 32) -> Iterator[list[str]]: +def buffered_read( + file: TextIO, + processed_ids: set[str], + buffer_size: int = 32, +) -> Iterator[list[str]]: """Buffered read. Args: file: File object. + processed_ids: Processed IDs. buffer_size: Buffer size. 
Yields: @@ -168,6 +173,9 @@ def buffered_read(file: TextIO, buffer_size: int = 32) -> Iterator[list[str]]: """ lines: list[str] = [] for line in file: + dat = json.loads(line) + if dat["docId"] in processed_ids: + continue lines.append(line) if len(lines) == buffer_size: yield lines @@ -183,19 +191,37 @@ def main() -> None: parser.add_argument("--output-file", type=str, required=True, help="Output file.") parser.add_argument("--num-workers", type=int, default=-1, help="Number of workers.") parser.add_argument("--buffer-size", type=int, default=32, help="Buffer size.") + parser.add_argument("--overwrite", action="store_true", help="Overwrite output file.") args = parser.parse_args() num_workers = args.num_workers if args.num_workers != -1 else os.cpu_count() os.makedirs(os.path.dirname(args.output_file), exist_ok=True) + # Create an empty file if overwrite is True + if args.overwrite: + with open(args.output_file, "wt", encoding="utf-8"): + pass + + # Get processed lines if overwrite is False + processed_ids: set[str] = set() + if not args.overwrite and os.path.exists(args.output_file): + with open(args.output_file, "rt", encoding="utf-8") as fin: + for line in fin: + dat = json.loads(line) + processed_ids.add(dat["docId"]) + with ( open(args.input_file, "rt", encoding="utf-8") as fin, open(args.output_file, "wt", encoding="utf-8") as fout, ): with concurrent.futures.ProcessPoolExecutor(num_workers) as executor: futures = [] - for lines in buffered_read(fin, buffer_size=args.buffer_size): + for lines in buffered_read( + fin, + processed_ids=processed_ids, + buffer_size=args.buffer_size, + ): futures.append(executor.submit(process_lines, lines)) for future in tqdm.tqdm(futures): fout.write(future.result()) From 9f9b787eb8a05cf0728cbbec319790c5447e1d0d Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Mon, 20 Jan 2025 09:31:07 +0900 Subject: [PATCH 15/23] fmt --- .../llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py | 10 +++++++--- 1 file changed, 7 
insertions(+), 3 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py index a0c20d21..f1f229f2 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -189,15 +189,19 @@ def main() -> None: parser = argparse.ArgumentParser("Remove intra-sentence line breaks from text.") parser.add_argument("--input-file", type=str, required=True, help="Input file.") parser.add_argument("--output-file", type=str, required=True, help="Output file.") - parser.add_argument("--num-workers", type=int, default=-1, help="Number of workers.") + parser.add_argument( + "--num-workers", type=int, default=-1, help="Number of workers." + ) parser.add_argument("--buffer-size", type=int, default=32, help="Buffer size.") - parser.add_argument("--overwrite", action="store_true", help="Overwrite output file.") + parser.add_argument( + "--overwrite", action="store_true", help="Overwrite output file." 
+ ) args = parser.parse_args() num_workers = args.num_workers if args.num_workers != -1 else os.cpu_count() os.makedirs(os.path.dirname(args.output_file), exist_ok=True) - + # Create an empty file if overwrite is True if args.overwrite: with open(args.output_file, "wt", encoding="utf-8"): From fd49bd2802c252160067849ef0b811b1a5b7e782 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Mon, 20 Jan 2025 09:31:34 +0900 Subject: [PATCH 16/23] update --- corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py index f1f229f2..6ecec804 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -192,7 +192,7 @@ def main() -> None: parser.add_argument( "--num-workers", type=int, default=-1, help="Number of workers." ) - parser.add_argument("--buffer-size", type=int, default=32, help="Buffer size.") + parser.add_argument("--buffer-size", type=int, default=16, help="Buffer size.") parser.add_argument( "--overwrite", action="store_true", help="Overwrite output file."
) From 3064667ad7f2e63e17ee08daea7261a07596cde2 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Mon, 20 Jan 2025 09:35:03 +0900 Subject: [PATCH 17/23] fix --- corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py index 6ecec804..dee814f3 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -217,7 +217,7 @@ def main() -> None: with ( open(args.input_file, "rt", encoding="utf-8") as fin, - open(args.output_file, "wt", encoding="utf-8") as fout, + open(args.output_file, "at", encoding="utf-8") as fout, ): with concurrent.futures.ProcessPoolExecutor(num_workers) as executor: futures = [] From 07d59532d4ad1f8a19f4e2dc6c13b6dadb98e268 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 23 Jan 2025 14:42:30 +0900 Subject: [PATCH 18/23] fix --- .../ja/ja_warp_pdf/scripts/convert.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py index dee814f3..3644eb0c 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/convert.py @@ -180,8 +180,7 @@ def buffered_read( if len(lines) == buffer_size: yield lines lines = [] - if lines: - yield lines + yield lines def main() -> None: @@ -192,7 +191,7 @@ def main() -> None: parser.add_argument( "--num-workers", type=int, default=-1, help="Number of workers." ) - parser.add_argument("--buffer-size", type=int, default=16, help="Buffer size.") + parser.add_argument("--buffer-size", type=int, default=32, help="Buffer size.") parser.add_argument( "--overwrite", action="store_true", help="Overwrite output file." 
) @@ -227,8 +226,12 @@ def main() -> None: buffer_size=args.buffer_size, ): futures.append(executor.submit(process_lines, lines)) - for future in tqdm.tqdm(futures): + for future in tqdm.tqdm( + concurrent.futures.as_completed(futures), + total=len(futures), + ): fout.write(future.result()) + fout.flush() if __name__ == "__main__": From 1dd1275b53ff2f1e825399c03add7f00783a35ed Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 30 Jan 2025 09:30:22 +0900 Subject: [PATCH 19/23] update --- .../ja/ja_warp_pdf/scripts/filter.py | 163 ++++++++++++++++++ 1 file changed, 163 insertions(+) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py index e69de29b..1175b3c8 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py @@ -0,0 +1,163 @@ +"""Remove low-quality documents.""" +import argparse +import logging +import json +import os +import concurrent +import concurrent.futures +from typing import Iterator, TextIO + +import tqdm + +logger = logging.getLogger(__name__) + + +def get_line_break_or_white_space_ratio(text: str) -> float: + """Get the ratio of line breaks and white spaces in the text. + + Args: + text: The text to analyze. + + Returns: + The ratio of line breaks and white spaces in the text. + """ + if text == "": + return 0.0 + return (text.count("\n") + text.count(" ")) / len(text) + + +def get_short_line_ratio(text: str, threshold: int = 5) -> float: + """Get the ratio of characters in short lines in the text. + + Args: + text: The text to analyze. + threshold: The threshold to determine if a line is short. + + Returns: + The ratio of short lines in the text. 
+ """ + if text == "": + return 0.0 + lines = [line for line in text.split("\n") if line.strip()] + if len(lines) == 0: + return 0.0 + short_lines = [line for line in lines if len(line.replace(" ", "")) <= threshold] + return sum(map(len, short_lines)) / sum(map(len, lines)) + + +def process_line( + line: str, + line_break_or_white_space_ratio_threshold: float = 0.2, + short_line_ratio_threshold: float = 0.1, +) -> str: + """Process a line in the input file. + + Args: + line: A line in the input file. + line_break_or_white_space_ratio_threshold: The threshold of the ratio of line breaks and white spaces. + short_line_ratio_threshold: The threshold of the ratio of short lines. + """ + try: + row = json.loads(line) + except Exception as e: + logging.error(f"Error: {e}") + return line + assert "meta" not in row + text = row["text"] + row["meta"] = { + "line_break_or_white_space_ratio": get_line_break_or_white_space_ratio(text), + "short_line_ratio": get_short_line_ratio(text), + } + if ( + row["meta"]["line_break_or_white_space_ratio"] + > line_break_or_white_space_ratio_threshold + ): + return "" + if row["meta"]["short_line_ratio"] > short_line_ratio_threshold: + return "" + return json.dumps(row, ensure_ascii=False) + "\n" + + +def process_lines(lines: list[str]) -> str: + """Process lines. + + Args: + lines (list[str]): Input lines. + + Returns: + str: Processed lines. + """ + ret: str = "" + for line in lines: + ret += process_line(line) + return ret + + +def buffered_read(file: TextIO, buffer_size: int = 32) -> Iterator[list[str]]: + """Buffered read. + + Args: + file: File object. + processed_ids: Processed IDs. + buffer_size: Buffer size. + + Yields: + str: Line. 
+ """ + lines: list[str] = [] + for line in file: + lines.append(line) + if len(lines) == buffer_size: + yield lines + lines = [] + yield lines + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "--input-file", + type=str, + required=True, + help="Path to the input file.", + ) + parser.add_argument( + "--output-file", + type=str, + required=True, + help="Path to the output file.", + ) + parser.add_argument( + "--num-workers", + type=int, + default=1, + help="Number of workers for multiprocessing.", + ) + parser.add_argument("--buffer-size", type=int, default=256, help="Buffer size.") + args = parser.parse_args() + + num_workers = args.num_workers if args.num_workers != -1 else os.cpu_count() + + os.makedirs(os.path.dirname(args.output_file), exist_ok=True) + + with ( + open(args.input_file, "rt", encoding="utf-8") as fin, + open(args.output_file, "wt", encoding="utf-8") as fout, + ): + with concurrent.futures.ProcessPoolExecutor(num_workers) as executor: + futures = [] + for lines in buffered_read(fin, buffer_size=args.buffer_size): + futures.append(executor.submit(process_lines, lines)) + for future in tqdm.tqdm( + concurrent.futures.as_completed(futures), + total=len(futures), + ): + fout.write(future.result()) + fout.flush() + + +if __name__ == "__main__": + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(message)s", + ) + main() From a8d521614fd34a6fac868b741c4780cde59a5f6e Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 30 Jan 2025 09:39:54 +0900 Subject: [PATCH 20/23] remove docs wo meta --- corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py index 1175b3c8..c2c7690a 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py @@ -62,7 +62,8 
@@ def process_line( except Exception as e: logging.error(f"Error: {e}") return line - assert "meta" not in row + if "meta" not in row: + return "" # Timeout or other errors text = row["text"] row["meta"] = { "line_break_or_white_space_ratio": get_line_break_or_white_space_ratio(text), From 9cea63f92f07748c1e17eb851ebf00505e46662d Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 30 Jan 2025 09:46:57 +0900 Subject: [PATCH 21/23] add launch.py --- .../ja/ja_warp_pdf/scripts/launch.py | 118 ++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/launch.py diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/launch.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/launch.py new file mode 100644 index 00000000..33756669 --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/launch.py @@ -0,0 +1,118 @@ +import argparse +import logging +from pathlib import Path +import subprocess +from queue import Queue +from time import sleep + + +logger = logging.getLogger(__name__) + +here = Path(__file__).parent + +script_paths = { + "convert": here / "convert.py", + "filter": here / "filter.py", +} + +interpreter_path = here.parent / ".venv" / "bin" / "python" + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "MODE", + type=str, + choices=["test", "convert", "filter"], + help="Mode to run", + ) + parser.add_argument( + "--input-dir", + type=str, + required=True, + help="Path to the input file", + ) + parser.add_argument( + "--output-dir", + type=str, + required=True, + help="Path to the output file", + ) + parser.add_argument( + "--hosts", + type=str, + required=True, + help="Path to the hosts file", + ) + args = parser.parse_args() + + script_path = script_paths[args.MODE] + + input_dir = Path(args.input_dir) + output_dir = Path(args.output_dir) + + queue = Queue() + for input_path in input_dir.glob("**/*.jsonl"): + output_path = output_dir / 
input_path.relative_to(input_dir) + command = [ + str(interpreter_path), + str(script_path), + "--input-file", + str(input_path), + "--output-file", + str(output_path), + ] + queue.put(command) + + waiting = Queue() + with open(args.hosts) as f: + for line in f: + host = line.strip() + proc = subprocess.Popen( + ["ssh", "-i", "~/.ssh/id_ed_25519", "-o", "StrictHostKeyChecking=no", host, "true"], + stderr=subprocess.PIPE, + ) + if proc.wait() != 0: + logger.error(f"Host {host} is not available") + continue + logger.info(f"Host {host} is available") + waiting.put(line.strip()) + + logger.info(f"Available hosts: {waiting.qsize()}") + + running = [] + while not queue.empty() or len(running) > 0: + while not waiting.empty() and not queue.empty(): + host = waiting.get() + command = queue.get() + logger.info(f"Running {command} on {host}") + proc = subprocess.Popen( + ["ssh", "-i", "~/.ssh/id_ed_25519", "-o", "StrictHostKeyChecking=no", host] + command, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + ) + running.append((host, proc)) + + for host, proc in running: + if proc.poll() is not None: + if proc.returncode != 0: + error_message = proc.stderr.read().decode("utf-8") + logger.error(f"Failed {proc.args} on {host}: {error_message}") + queue.put(proc.args[6:]) + else: + logger.info(f"Finished {proc.args} on {host}") + running.remove((host, proc)) + waiting.put(host) + break + + sleep(1) + + logger.info("All done") + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + ) + main() From e2d476bc689ab6cad5c68141cccdb575d0b50816 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 30 Jan 2025 09:47:13 +0900 Subject: [PATCH 22/23] fmt --- .../ja/ja_warp_pdf/scripts/filter.py | 1 + .../ja/ja_warp_pdf/scripts/launch.py | 20 +++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py 
b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py index c2c7690a..f0535f96 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py @@ -1,4 +1,5 @@ """Remove low-quality documents.""" + import argparse import logging import json diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/launch.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/launch.py index 33756669..704eecbc 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/launch.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/launch.py @@ -69,7 +69,15 @@ def main() -> None: for line in f: host = line.strip() proc = subprocess.Popen( - ["ssh", "-i", "~/.ssh/id_ed_25519", "-o", "StrictHostKeyChecking=no", host, "true"], + [ + "ssh", + "-i", + "~/.ssh/id_ed_25519", + "-o", + "StrictHostKeyChecking=no", + host, + "true", + ], stderr=subprocess.PIPE, ) if proc.wait() != 0: @@ -87,7 +95,15 @@ def main() -> None: command = queue.get() logger.info(f"Running {command} on {host}") proc = subprocess.Popen( - ["ssh", "-i", "~/.ssh/id_ed_25519", "-o", "StrictHostKeyChecking=no", host] + command, + [ + "ssh", + "-i", + "~/.ssh/id_ed_25519", + "-o", + "StrictHostKeyChecking=no", + host, + ] + + command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, ) From 99f1f7c15f9652b742d10833ab72770cf1c3314e Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Thu, 30 Jan 2025 13:02:48 +0900 Subject: [PATCH 23/23] fix --- corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py index f0535f96..25b87e52 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_warp_pdf/scripts/filter.py @@ -63,13 +63,14 @@ def process_line( except Exception as e: logging.error(f"Error: {e}") return line - if "meta" 
not in row: - return "" # Timeout or other errors text = row["text"] + orig_meta = row.get("meta", {}) row["meta"] = { "line_break_or_white_space_ratio": get_line_break_or_white_space_ratio(text), "short_line_ratio": get_short_line_ratio(text), } + if orig_meta: + row["meta"]["meta"] = orig_meta if ( row["meta"]["line_break_or_white_space_ratio"] > line_break_or_white_space_ratio_threshold