Skip to content

Commit e6de343

Browse files
authored
Added --Streaming option for very large csv files
1 parent 2ab8e12 commit e6de343

File tree

2 files changed

+105
-12
lines changed

2 files changed

+105
-12
lines changed

db_diff/__init__.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,76 @@ def compare(previous, current, show_unchanged=False, fields=None, ignorefields=N
123123
return result
124124

125125

126+
def streaming_compare_csv(prev_path, curr_path, key, compare_columns=None, ignorefields=None, encoding='utf-8', dialect='excel'):
127+
"""
128+
Compare two sorted CSV files by streaming, returning a diff dict.
129+
"""
130+
import csv
131+
result = {
132+
"added": [],
133+
"removed": [],
134+
"changed": [],
135+
"columns_added": [],
136+
"columns_removed": [],
137+
}
138+
139+
with open(prev_path, newline='', encoding=encoding) as f1, open(curr_path, newline='', encoding=encoding) as f2:
140+
reader1 = csv.DictReader(f1, dialect=dialect)
141+
reader2 = csv.DictReader(f2, dialect=dialect)
142+
prev_row = next(reader1, None)
143+
curr_row = next(reader2, None)
144+
145+
prev_columns = set(reader1.fieldnames)
146+
curr_columns = set(reader2.fieldnames)
147+
148+
# Determine columns to compare
149+
if compare_columns:
150+
compare_columns = set(compare_columns)
151+
elif ignorefields:
152+
compare_columns = (prev_columns | curr_columns) - set(ignorefields)
153+
else:
154+
compare_columns = prev_columns | curr_columns
155+
156+
result["columns_added"] = [c for c in curr_columns if c not in prev_columns and c in compare_columns]
157+
result["columns_removed"] = [c for c in prev_columns if c not in curr_columns and c in compare_columns]
158+
159+
while prev_row or curr_row:
160+
if prev_row and curr_row:
161+
if key not in prev_row or key not in curr_row:
162+
raise KeyError(f"Key column '{key}' missing in one of the rows.")
163+
prev_key = str(prev_row[key])
164+
curr_key = str(curr_row[key])
165+
if prev_key == curr_key:
166+
# Check for changes
167+
changed_fields = {
168+
col: [prev_row.get(col), curr_row.get(col)]
169+
for col in compare_columns
170+
if prev_row.get(col) != curr_row.get(col)
171+
}
172+
if changed_fields:
173+
result["changed"].append({
174+
"key": prev_key,
175+
"changes": changed_fields
176+
})
177+
prev_row = next(reader1, None)
178+
curr_row = next(reader2, None)
179+
elif prev_key < curr_key:
180+
# Row removed
181+
result["removed"].append(prev_row)
182+
prev_row = next(reader1, None)
183+
else:
184+
# Row added
185+
result["added"].append(curr_row)
186+
curr_row = next(reader2, None)
187+
elif prev_row:
188+
result["removed"].append(prev_row)
189+
prev_row = next(reader1, None)
190+
elif curr_row:
191+
result["added"].append(curr_row)
192+
curr_row = next(reader2, None)
193+
return result
194+
195+
126196
def human_text(result, key=None, current=None, extras=None):
127197
title = []
128198
summary = []

db_diff/cli.py

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import time as time_module
44
import os
55
import datetime
6-
from . import load_csv, load_json, compare, human_text
6+
from . import load_csv, load_json, compare, human_text, streaming_compare_csv
77

88
@click.command(
99
context_settings={"help_option_names": ["-h", "--help"]},
@@ -81,11 +81,16 @@
8181
show_default=True,
8282
help="Input File Encoding. Available: (utf-8|utf-16|utf-16le|utf-16be|latin1|cp1252|ascii|...).",
8383
)
84+
@click.option(
85+
"--streaming",
86+
is_flag=True,
87+
help="Use streaming mode for very large CSV/TSV files (requires files to be sorted by key).",
88+
)
8489
@click.version_option()
8590
def cli(
8691
previous, current, key, input_format,
8792
show_unchanged, encoding, show_time, output, output_file, output_path,
88-
fields, ignorefields
93+
fields, ignorefields, streaming
8994
):
9095
"""Compare the differences between two CSV or JSON files."""
9196
dialect = {
@@ -148,16 +153,34 @@ def load(filename, key):
148153
fields_set = set(f.strip() for f in fields.split(",")) if fields else None
149154
ignorefields_set = set(f.strip() for f in ignorefields.split(",")) if ignorefields else None
150155

151-
previous_data = load(previous, key)
152-
current_data = load(current, key)
153-
diff = compare(
154-
previous_data,
155-
current_data,
156-
show_unchanged,
157-
fields=fields_set,
158-
ignorefields=ignorefields_set,
159-
)
160-
# test edit
156+
# --- Streaming logic ---
157+
if streaming and (input_format in ("csv", "tsv", None)):
158+
# Default to csv if not specified
159+
fmt = input_format or previous.split(".")[-1].lower()
160+
if fmt not in ("csv", "tsv"):
161+
raise click.ClickException("--streaming only works with CSV/TSV files.")
162+
diff = streaming_compare_csv(
163+
previous,
164+
current,
165+
key=key,
166+
compare_columns=fields_set,
167+
ignorefields=ignorefields_set,
168+
encoding=encoding,
169+
dialect=dialect.get(fmt, "excel"),
170+
)
171+
# For human_text, we need current_data for extras (if used)
172+
current_data = None
173+
else:
174+
previous_data = load(previous, key)
175+
current_data = load(current, key)
176+
diff = compare(
177+
previous_data,
178+
current_data,
179+
show_unchanged,
180+
fields=fields_set,
181+
ignorefields=ignorefields_set,
182+
)
183+
161184
if output == "json":
162185
print(std_json.dumps(diff, indent=4))
163186
elif output == "jsonfile":

0 commit comments

Comments
 (0)