Skip to content

Commit f52d491

Browse files
committed
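Added a warning in sort.py when a sort column is not listed in pairsam_format.DTYPES_PAIRSAM, informing users that the column is treated as a string (i.e. compared lexicographically rather than numerically), which may produce an unexpected order if the column actually holds numeric values.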
Modified test_sort.py: the new test_custom_column_warning writes a temporary .pairsam file whose header declares custom_col, so that the data-type warning is triggered and its text is checked.
1 parent 9ec09f1 commit f52d491

File tree

2 files changed

+69
-6
lines changed

2 files changed

+69
-6
lines changed

pairtools/cli/sort.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,9 @@
6868
nargs=1,
6969
type=str,
7070
multiple=True,
71-
help="Extra column (name or numerical index) that is also used for sorting."
72-
"The option can be provided multiple times."
73-
'Example: --extra-col "phase1" --extra-col "phase2". [output format option]',
71+
help="Extra column (name or numerical index) to sort by. "
72+
"If not defined in pairsam format, treated as a string and a warning is issued. "
73+
"Can be provided multiple times, e.g., --extra-col phase1 --extra-col phase2.",
7474
)
7575
@click.option(
7676
"--nproc",
@@ -230,8 +230,14 @@ def sort_py(
230230
for col in sort_columns:
231231
if col is None:
232232
continue # Skip optional columns
233-
dtype = pairsam_format.DTYPES_PAIRSAM.get(column_names[col], str)
234-
cols.append(f"-k {col+1},{col+1}{'n' if issubclass(dtype, int) else ''}")
233+
col_name = column_names[col]
234+
dtype = pairsam_format.DTYPES_PAIRSAM.get(col_name, str)
235+
if col_name not in pairsam_format.DTYPES_PAIRSAM:
236+
logger.warning(
237+
f"Column '{col_name}' not found in pairsam format definitions. "
238+
"Assuming string type for sorting, which may affect sort order if numeric."
239+
)
240+
cols.append(f"-k {col + 1},{col + 1}{'n' if issubclass(dtype, int) else ''}")
235241
cols = " ".join(cols)
236242

237243
command = rf"""

tests/test_sort.py

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import os
33
import subprocess
44
import sys
5-
# import pytest
5+
import pytest
66

77
testdir = os.path.dirname(os.path.realpath(__file__))
88

@@ -61,3 +61,60 @@ def test_mock_pairsam():
6161
assert int(cur_pair[3]) >= int(prev_pair[3])
6262

6363
prev_pair = cur_pair
64+
65+
66+
def test_custom_column_warning(tmpdir):
    """Check that sorting by a column unknown to the pairsam format warns.

    Writes a two-record .pairsam file whose ``#columns`` header declares
    ``custom_col``, runs ``pairtools sort --extra-col custom_col`` on it in a
    subprocess, and asserts that:

    * the command exits with status 0;
    * both parts of the data-type warning appear on stderr;
    * the output file exists, is non-empty, carries a ``#sorted`` header
      line and has at least one body line.

    The order of the output records is not verified here.
    """
    # str() makes the conversion explicit instead of relying on the fixture
    # object being accepted by os.path.join as a path-like.
    workdir = str(tmpdir)
    mock_pairsam_path = os.path.join(workdir, "test.pairsam")
    output_path = os.path.join(workdir, "sorted_output.pairsam")

    # Minimal input: format line, column header with the extra column, two pairs.
    with open(mock_pairsam_path, "w") as f:
        f.write("## pairs format v1.0\n")
        f.write("#columns: readID chr1 pos1 chr2 pos2 strand1 strand2 pair_type custom_col\n")
        f.write("read1\tchr1\t100\tchr2\t200\t+\t-\tUU\t42\n")
        f.write("read2\tchr2\t150\tchr1\t250\t-\t+\tUU\t99\n")

    # Use the interpreter that is running the tests; a bare "python" may point
    # at a different installation (or not exist at all on python3-only hosts).
    cmd = [
        sys.executable,
        "-m",
        "pairtools",
        "sort",
        mock_pairsam_path,
        "--output",
        output_path,
        "--extra-col",
        "custom_col",
    ]

    # Capture both streams as text; the warning is looked up in stderr below.
    result = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    stderr = result.stderr

    # The command must complete successfully.
    assert result.returncode == 0, f"Command failed: {stderr}"

    # The warning about the unknown column must have been emitted.
    assert "Column 'custom_col' not found in pairsam format definitions" in stderr
    assert "Assuming string type for sorting" in stderr

    # The output file must exist and have content.
    assert os.path.exists(output_path)
    with open(output_path, "r") as f:
        output_lines = f.readlines()
    assert len(output_lines) > 0

    # Basic sanity check only: the header is marked as sorted and at least one
    # record survived. This does not compare the order of the records.
    output_header = [line.strip() for line in output_lines if line.startswith("#")]
    output_body = [
        line.strip()
        for line in output_lines
        if not line.startswith("#") and line.strip()
    ]
    assert any(line.startswith("#sorted") for line in output_header)
    assert len(output_body) > 0

0 commit comments

Comments
 (0)