Skip to content

Commit f94dcbd

Browse files
Change count to default to stdout and rename delim argument
1 parent b49747b commit f94dcbd

File tree

3 files changed

+60
-13
lines changed

3 files changed

+60
-13
lines changed

README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,7 @@ and document count is not printed.
419419
The `count` subcommand can produce the counts of chunks in the input
420420
file. Unlike `summarize`, it counts chunk-type pairs, not just types.
421421
For example, if we run
422-
`seqscore count --labels BIO samples/reference.bio counts.csv`,
422+
`seqscore count --labels BIO samples/reference.bio --output-file counts.csv`,
423423
tab-delimited counts would be written to `counts.csv` as follows:
424424

425425
```
@@ -428,6 +428,14 @@ tab-delimited counts would be written to `counts.csv` as follows:
428428
1 LOC Pennsylvania
429429
```
430430

431+
You can also call `count` without the `--output-file` argument to print counts to
432+
standard output. However, you may encounter Unicode issues if your terminal is not
433+
configured to use a Unicode encoding such as UTF-8.
434+
435+
You can use the `--output-delim` argument to change the delimiter used in the counts.
436+
The default delimiter of tab is strongly recommended, as there is no escaping or
437+
quoting of the names in the output.
438+
431439
## Process
432440

433441
The `process` subcommand can remove entity types from a file or map them to

seqscore/scripts/seqscore.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
import sys
33
from collections import Counter
4+
from contextlib import nullcontext
45
from typing import Callable, Optional
56

67
import click
@@ -273,32 +274,36 @@ def process(
273274

274275
@cli.command(help="show counts for all the mentions contained in a file")
275276
@_multi_input_file_arguments
276-
@click.argument("output_file")
277+
@click.option(
278+
"--output-file",
279+
default=None,
280+
help="path to write output to [default: stdout]",
281+
)
277282
@_repair_option()
278283
@_labels_option_default_bio()
279284
@click.option(
280-
"--delim",
285+
"--output-delim",
281286
default="\t",
282287
help="the delimiter to be used for output (has no effect on input) [default: tab]",
283288
)
284289
@_quiet_option()
285290
def count(
286291
file: list[str], # Name is "file" to make sense on the command line, but it's a list
287292
file_encoding: str,
288-
output_file: str,
293+
output_file: Optional[str],
289294
labels: str,
290295
*,
291296
ignore_document_boundaries: bool,
292297
parse_comment_lines: bool,
293-
delim: str,
298+
output_delim: str,
294299
repair_method: str,
295300
quiet: bool,
296301
) -> None:
297302
if repair_method == REPAIR_NONE:
298303
repair_method = None
299304

300-
delim = _normalize_tab(delim)
301-
if delim != "\t":
305+
output_delim = _normalize_tab(output_delim)
306+
if output_delim != "\t":
302307
print(
303308
"Warning: Using a delimiter other than tab is not recommended as fields are not quoted",
304309
file=sys.stderr,
@@ -322,9 +327,16 @@ def count(
322327
key = (mention.type, sequence.mention_tokens(mention))
323328
counts[key] += 1
324329

325-
with open(output_file, "w", encoding=file_encoding) as output:
330+
with (
331+
open(output_file, "w", encoding=file_encoding)
332+
if output_file
333+
else nullcontext(sys.stdout) as output
334+
):
326335
for item, item_count in counts.most_common():
327-
print(delim.join((str(item_count), item[0], " ".join(item[1]))), file=output)
336+
print(
337+
output_delim.join((str(item_count), item[0], " ".join(item[1]))),
338+
file=output,
339+
)
328340

329341

330342
@cli.command(help="show counts of the documents, sentences, and entity types")

tests/test_count_click.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def test_count_BIO() -> None:
2929
"--labels",
3030
"BIO",
3131
os.path.join("tests", "conll_annotation", "minimal.bio"),
32+
"--output-file",
3233
os.path.join(TMP_DIR.name, "count_BIO_out.txt"),
3334
],
3435
)
@@ -39,6 +40,23 @@ def test_count_BIO() -> None:
3940
)
4041

4142

43+
def test_count_BIO_stdout() -> None:
44+
runner = CliRunner()
45+
result = runner.invoke(
46+
count,
47+
[
48+
"--labels",
49+
"BIO",
50+
os.path.join("tests", "conll_annotation", "minimal.bio"),
51+
],
52+
)
53+
assert result.exit_code == 0
54+
expected_lines = open(
55+
os.path.join("tests", "test_files", "count_minimal_ref.txt")
56+
).read()
57+
assert result.stdout == expected_lines
58+
59+
4260
def test_count_BIO_twofiles() -> None:
4361
runner = CliRunner()
4462
result = runner.invoke(
@@ -48,6 +66,7 @@ def test_count_BIO_twofiles() -> None:
4866
"BIO",
4967
os.path.join("tests", "conll_annotation", "minimal.bio"),
5068
os.path.join("tests", "conll_annotation", "minimal2.bio"),
69+
"--output-file",
5170
os.path.join(TMP_DIR.name, "count_BIO_out.txt"),
5271
],
5372
)
@@ -66,9 +85,10 @@ def test_count_BIO_tab1() -> None:
6685
[
6786
"--labels",
6887
"BIO",
69-
"--delim",
88+
"--output-delim",
7089
"\t", # Actual tab
7190
os.path.join("tests", "conll_annotation", "minimal.bio"),
91+
"--output-file",
7292
os.path.join(TMP_DIR.name, "count_BIO_out.txt"),
7393
],
7494
)
@@ -86,9 +106,10 @@ def test_count_BIO_tab2() -> None:
86106
[
87107
"--labels",
88108
"BIO",
89-
"--delim",
109+
"--output-delim",
90110
r"\t", # Backslash and t
91111
os.path.join("tests", "conll_annotation", "minimal.bio"),
112+
"--output-file",
92113
os.path.join(TMP_DIR.name, "count_BIO_out.txt"),
93114
],
94115
)
@@ -106,9 +127,10 @@ def test_count_BIO_tab3() -> None:
106127
[
107128
"--labels",
108129
"BIO",
109-
"--delim",
130+
"--output-delim",
110131
"tab", # Tab spelled out
111132
os.path.join("tests", "conll_annotation", "minimal.bio"),
133+
"--output-file",
112134
os.path.join(TMP_DIR.name, "count_BIO_out.txt"),
113135
],
114136
)
@@ -126,9 +148,10 @@ def test_count_BIO_comma() -> None:
126148
[
127149
"--labels",
128150
"BIO",
129-
"--delim",
151+
"--output-delim",
130152
",",
131153
os.path.join("tests", "conll_annotation", "minimal.bio"),
154+
"--output-file",
132155
os.path.join(TMP_DIR.name, "count_BIO_out.txt"),
133156
],
134157
)
@@ -149,6 +172,7 @@ def test_count_BIOES() -> None:
149172
"--repair-method",
150173
"none",
151174
os.path.join("tests", "conll_annotation", "minimal.bioes"),
175+
"--output-file",
152176
os.path.join(TMP_DIR.name, "count_BIOES_out.txt"),
153177
],
154178
)
@@ -169,6 +193,7 @@ def test_count_IO() -> None:
169193
"--repair-method",
170194
"none",
171195
os.path.join("tests", "conll_annotation", "minimal.io"),
196+
"--output-file",
172197
os.path.join(TMP_DIR.name, "count_IO_out.txt"),
173198
],
174199
)
@@ -189,6 +214,7 @@ def test_count_BIO_invalid_conlleval() -> None:
189214
"--repair-method",
190215
"conlleval",
191216
os.path.join("tests", "conll_annotation", "invalid1.bio"),
217+
"--output-file",
192218
os.path.join(TMP_DIR.name, "count_BIO_conlleval_out.txt"),
193219
],
194220
)
@@ -209,6 +235,7 @@ def test_count_BIO_invalid_discard() -> None:
209235
"--repair-method",
210236
"discard",
211237
os.path.join("tests", "conll_annotation", "invalid1.bio"),
238+
"--output-file",
212239
os.path.join(TMP_DIR.name, "count_BIO_discard_out.txt"),
213240
],
214241
)

0 commit comments

Comments
 (0)