feat(silo-import): use orjsonl to parse ndjson, only check pipeline version of first record #5764
```diff
@@ -2,13 +2,11 @@
 from __future__ import annotations
 
-import io
-import json
 import logging
 from dataclasses import dataclass
 from pathlib import Path
 
-import zstandard
+import orjsonl
 
 logger = logging.getLogger(__name__)
 
@@ -18,7 +16,7 @@ class NdjsonAnalysis:
     """Result of analyzing an NDJSON file."""
 
     record_count: int
-    pipeline_versions: set[int]
+    pipeline_version: int | None
 
 
 def analyze_ndjson(path: Path) -> NdjsonAnalysis:
@@ -35,30 +33,17 @@ def analyze_ndjson(path: Path) -> NdjsonAnalysis:
         RuntimeError: If decompression or JSON parsing fails
     """
     record_count = 0
-    pipeline_versions: set[int] = set()
-    decompressor = zstandard.ZstdDecompressor()
+    pipeline_version: int | None = None
 
     try:
-        with path.open("rb") as compressed, decompressor.stream_reader(compressed) as reader:
-            text_stream = io.TextIOWrapper(reader, encoding="utf-8")
-            for line in text_stream:
-                line_stripped = line.strip()
-                if not line_stripped:
-                    continue
-                record_count += 1
-                try:
-                    obj = json.loads(line_stripped)
-                except json.JSONDecodeError as exc:
-                    msg = f"Invalid JSON record: {exc}"
-                    raise RuntimeError(msg) from exc
+        for record in orjsonl.stream(path):
+            record_count += 1
+            if pipeline_version is None:
```
**Contributor:**

> That's not helping efficiency; we should count lines using a dedicated fast tool, like `zstdcat` and `wc`, and parse the pipeline version by reading just the first line.
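A minimal sketch of what this suggestion could look like (hypothetical helper, not part of the PR; assumes `zstdcat` and `wc` are on PATH and the `metadata.pipelineVersion` shape from the diff above):

```python
import io
import json
import subprocess
from pathlib import Path

import zstandard


def analyze_ndjson_fast(path: Path) -> tuple[int, int | None]:
    """Sketch: count records with zstdcat | wc -l; parse only the first line."""
    # Count lines without parsing any JSON. Note: `wc -l` counts newlines,
    # so a file lacking a trailing newline undercounts by one record.
    zstdcat = subprocess.Popen(["zstdcat", str(path)], stdout=subprocess.PIPE)
    wc = subprocess.run(["wc", "-l"], stdin=zstdcat.stdout, capture_output=True, check=True)
    zstdcat.stdout.close()
    zstdcat.wait()
    record_count = int(wc.stdout.strip())

    # Decompress only enough of the stream to read the first record.
    with path.open("rb") as compressed:
        reader = zstandard.ZstdDecompressor().stream_reader(compressed)
        first_line = io.TextIOWrapper(reader, encoding="utf-8").readline()

    pipeline_version = None
    if first_line.strip():
        pipeline_version = json.loads(first_line).get("metadata", {}).get("pipelineVersion")
    return record_count, pipeline_version
```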
**Member:**

> All other things being equal, using

**Member:**

> (but yeah, sorry, agree we don't actually need to parse all the JSON at all)
**Contributor (Author):**

> I agree that this is definitely an improvement over the current state, but a subprocess with jq could potentially be more efficient.
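A sketch of the jq variant the author mentions (hypothetical, not part of the PR; assumes `jq` and `zstdcat` are installed). `jq -n 'input | ...'` reads exactly one JSON value from stdin and exits, so the bulk of the file is never parsed:

```python
import subprocess
from pathlib import Path


def first_pipeline_version_jq(path: Path) -> int | None:
    """Sketch: extract metadata.pipelineVersion from the first record only."""
    zstdcat = subprocess.Popen(["zstdcat", str(path)], stdout=subprocess.PIPE)
    jq = subprocess.run(
        ["jq", "-n", "input | .metadata.pipelineVersion"],
        stdin=zstdcat.stdout,
        capture_output=True,
        check=True,
    )
    zstdcat.stdout.close()  # drop the last read end; zstdcat exits on SIGPIPE
    zstdcat.wait()
    value = jq.stdout.strip()
    return None if value in (b"", b"null") else int(value)
```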
```diff
+                pipeline_version = record.get("metadata", {}).get("pipelineVersion")  # type: ignore
-
-                metadata = obj.get("metadata") if isinstance(obj, dict) else None
-                if isinstance(metadata, dict):
-                    pipeline_version = metadata.get("pipelineVersion")
-                    if pipeline_version:
-                        pipeline_versions.add(int(pipeline_version))
-    except zstandard.ZstdError as exc:
+    except Exception as exc:
         msg = f"Failed to decompress {path}: {exc}"
         logger.error(msg)
         raise RuntimeError(msg) from exc
 
-    return NdjsonAnalysis(record_count=record_count, pipeline_versions=pipeline_versions)
+    return NdjsonAnalysis(record_count=record_count, pipeline_version=pipeline_version)
```
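For context, orjsonl is documented to stream compressed JSON-lines files directly (including zstandard-compressed ones), which is why the explicit `zstandard` decompression drops out of the diff. A minimal usage sketch of the new function, with a hypothetical input path:

```python
from pathlib import Path

# Hypothetical path; any zstd-compressed NDJSON produced by the pipeline would do.
analysis = analyze_ndjson(Path("results/processed.ndjson.zst"))
print(analysis.record_count, analysis.pipeline_version)
```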