Skip to content

Commit 9146f8f

Browse files
authored
Merge pull request #2 from HPI-Information-Systems:feat/reference-data
Support loading reference data
2 parents 5a34b23 + 6703128 commit 9146f8f

File tree

6 files changed

+77 -47 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,3 +9,4 @@ out/
99
.idea/
1010
*.pyc
1111
__pycache__/
12+
.venv/

demo/getting_started.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
from metis.dq_orchestrator import DQOrchestrator
22

33
# No config file means default to console writer
4-
orchestrator = DQOrchestrator(writer_config="configs/writer/sqlite.json")
4+
orchestrator = DQOrchestrator(writer_config_path="configs/writer/sqlite.json")
55

66
orchestrator.load(data_loader_configs=["data/adult.json"])
77

metis/dq_orchestrator.py

Lines changed: 38 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,29 @@
1-
from typing import List
2-
import pandas as pd
31
import json
4-
import os
5-
import sqlite3
2+
from typing import Dict, List, Type
3+
4+
import pandas as pd
65

7-
import metis.globals
6+
from metis.loader.csv_loader import CSVLoader
87
from metis.metric import Metric
98
from metis.utils.data_config import DataConfig
109
from metis.utils.result import DQResult
11-
from metis.loader.csv_loader import CSVLoader
12-
from metis.writer.sqlite_writer import SQLiteWriter
13-
from metis.writer.postgres_writer import PostgresWriter
1410
from metis.writer.console_writer import ConsoleWriter
11+
from metis.writer.postgres_writer import PostgresWriter
12+
from metis.writer.sqlite_writer import SQLiteWriter
13+
1514

1615
class DQOrchestrator:
17-
def __init__(self, writer_config=None) -> None:
18-
self.dataframes = {}
19-
self.data_paths = {}
20-
self.results = {} #TODO: Decide what to do with these in memory results
16+
def __init__(self, writer_config_path: str | None = None) -> None:
17+
self.dataframes: Dict[str, pd.DataFrame] = {}
18+
self.reference_dataframes: Dict[str, pd.DataFrame] = {}
19+
self.data_paths: Dict[str, str] = {}
20+
self.results: Dict[str, DQResult] = (
21+
{}
22+
) # TODO: Decide what to do with these in memory results
2123

2224
self.writer = ConsoleWriter({})
23-
if writer_config:
24-
with open(writer_config, 'r') as f:
25+
if writer_config_path:
26+
with open(writer_config_path, "r") as f:
2527
writer_config = json.load(f)
2628
if not "writer_name" in writer_config:
2729
raise ValueError("Writer config must include 'writer_name' field.")
@@ -32,36 +34,46 @@ def __init__(self, writer_config=None) -> None:
3234

3335
def load(self, data_loader_configs: List[str]) -> None:
3436
for config_path in data_loader_configs:
35-
with open(config_path, 'r') as f:
37+
with open(config_path, "r") as f:
3638
config_data = json.load(f)
3739
config = DataConfig(config_data)
38-
config.file_name = os.path.join(metis.globals.data_root, config.file_name)
40+
3941
if config.loader == "CSV":
4042
loader = CSVLoader()
4143
dataframe = loader.load(config)
4244
self.dataframes[config.name] = dataframe
4345
self.data_paths[config.name] = config_path
4446

47+
if config.reference_file_name:
48+
reference_config = DataConfig(config_data)
49+
reference_config.file_name = config.reference_file_name
50+
reference_dataframe = loader.load(reference_config)
51+
self.reference_dataframes[config.name] = reference_dataframe
4552
else:
46-
raise ValueError(f"Unsupported loader type: {config_data.get('loader', None)}")
47-
48-
def assess(self, metrics: List[str], metric_configs: List[str]) -> None:
53+
raise ValueError(
54+
f"Unsupported loader type: {config_data.get('loader', None)}"
55+
)
56+
57+
def assess(self, metrics: List[str], metric_configs: List[str | None]) -> None:
4958
results = []
50-
59+
5160
for metric, metric_config in zip(metrics, metric_configs):
52-
metric_class = Metric.registry.get(metric)
61+
metric_class: Type[Metric] | None = Metric.registry.get(metric)
5362
if not metric_class:
5463
raise ValueError(f"Metric {metric} is not registered.")
55-
metric_instance = metric_class()
64+
metric_instance: Metric = metric_class()
5665
for df_name, df in self.dataframes.items():
57-
incomplete_metric_results = metric_instance.assess(df, metric_config=metric_config) #TODO: Add reference data support
66+
incomplete_metric_results = metric_instance.assess(
67+
data=df,
68+
reference=self.reference_dataframes.get(df_name),
69+
metric_config=metric_config,
70+
)
5871
for result in incomplete_metric_results:
5972
result.tableName = df_name
6073
result.dataset = self.data_paths[df_name]
6174
results.append(result)
6275

6376
self.writer.write(results)
6477

65-
66-
def getDQResult(query: str) -> List[DQResult]:
67-
pass
78+
def get_dq_result(self, query: str) -> List[DQResult]:
79+
return []

metis/loader/csv_loader.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,13 @@
33
from metis.loader.loader import DataLoader
44
from metis.utils.data_config import DataConfig
55

6+
67
class CSVLoader(DataLoader):
78
def load(self, config: DataConfig) -> pd.DataFrame:
89
"""
910
Load data from a CSV file specified by the config.
10-
11-
:param config: Path to the CSV file.
11+
12+
:param config: DataConfig object containing the CSV parsing details.
1213
:return: DataFrame containing the loaded data.
1314
"""
1415

@@ -22,4 +23,4 @@ def load(self, config: DataConfig) -> pd.DataFrame:
2223
parse_dates=config.parse_dates,
2324
decimal=config.decimals,
2425
thousands=config.thousands
25-
)
26+
)

metis/loader/loader.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
from abc import ABC, abstractmethod
22
import pandas as pd
33

4+
from metis.utils.data_config import DataConfig
5+
46
class DataLoader(ABC):
57
@abstractmethod
6-
def load(self, config: str) -> pd.DataFrame:
8+
def load(self, config: DataConfig) -> pd.DataFrame:
79
"""
810
Load data from a source defined by the config.
9-
10-
:param config: Configuration string or path to the configuration file.
11+
12+
:param config: Configuration object containing the data source details.
1113
:return: DataFrame containing the loaded data.
1214
"""
13-
pass
15+
pass

metis/utils/data_config.py

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,33 @@
1-
import json
2-
from typing import Dict
1+
from pathlib import Path
2+
from typing import Dict, List
3+
4+
import metis.globals
5+
36

47
class DataConfig:
58
def __init__(
69
self,
710
config: Dict,
811
):
9-
self.name = config.get('name')
10-
self.file_name = config.get('file_name')
11-
self.loader = config.get('loader')
12-
self.delimiter = config.get('delimiter', ',')
13-
self.encoding = config.get('encoding', 'utf-8')
14-
self.header = config.get('header', 0)
15-
self.nrows = config.get('nrows', None)
16-
self.usecols = config.get('usecols', None)
17-
self.parse_dates = config.get('parse_dates', False)
18-
self.decimals = config.get('decimals', ".")
19-
self.thousands = config.get('thousands', None)
12+
if "file_name" not in config:
13+
raise ValueError(f"Data config must include 'file_name' field.")
14+
if "name" not in config:
15+
raise ValueError(f"Data config must include 'name' field.")
16+
self.name: str = config["name"]
17+
self.file_name: str = Path(metis.globals.data_root) / config["file_name"]
18+
self.reference_file_name: str | None = (
19+
Path(metis.globals.data_root) / config["reference_file_name"]
20+
if config.get("reference_file_name")
21+
else None
22+
)
23+
self.loader: str | None = config.get("loader")
24+
self.delimiter: str = config.get("delimiter", ",")
25+
self.encoding: str = config.get("encoding", "utf-8")
26+
self.header: int = config.get("header", 0)
27+
self.nrows: int | None = config.get("nrows")
28+
self.usecols: List[str] | None = config.get("usecols")
29+
self.parse_dates: bool = config.get("parse_dates", False)
30+
self.decimals: str = config.get("decimals", ".")
31+
self.thousands: str | None = config.get("thousands")
32+
self.decimals: str = config.get("decimals", ".")
33+
self.thousands: str | None = config.get("thousands")

0 commit comments

Comments
 (0)