Skip to content

Commit 7b5bea3

Browse files
committed
Merge branch 'main' into sqlalchemy2
2 parents c30afd8 + 35bdc2d commit 7b5bea3

File tree

7 files changed

+126
-19
lines changed

7 files changed

+126
-19
lines changed

docs/source/_static/config_schema.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/source/configuration.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
Configuration
22
-------------
33

4+
SqlSynthGen is configured using a YAML file, which is passed to several commands with the ``--config`` option.
5+
Throughout the docs, we will refer to this file as ``config.yaml``, but it can be called anything. The one exception is that there will be a naming conflict if you have a vocabulary table called ``config``.
6+
7+
Below, we see the schema for the configuration file.
8+
Note that our config file format includes a section of SmartNoise SQL metadata, which is explained more fully `here <https://docs.smartnoise.org/sql/metadata.html#yaml-format>`_.
9+
410
.. raw:: html
511
:file: _static/config_schema.html

sqlsynthgen/json_schemas/config_schema.json

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,56 +6,94 @@
66
"additionalProperties": false,
77
"properties": {
88
"use-asyncio": {
9+
"description": "Run source-statistics queries using asyncpg.",
910
"type": "boolean"
1011
},
1112
"row_generators_module": {
13+
"description": "The name of a local Python module of row generators (excluding .py).",
1214
"type": "string"
1315
},
1416
"story_generators_module": {
17+
"description": "The name of a local Python module of story generators (excluding .py).",
1518
"type": "string"
1619
},
17-
1820
"src-stats": {
21+
"description": "An array of source statistics queries.",
1922
"type": "array",
2023
"items": {
2124
"additionalProperties": false,
2225
"required": ["name", "query"],
2326
"properties": {
2427
"name": {
28+
"description": "A name for the query, which will be used in the stats file.",
2529
"type": "string"
2630
},
2731
"query": {
32+
"description": "A SQL query.",
2833
"type": "string"
2934
},
3035
"dp-query": {
36+
"description": "A SmartNoise SQL query.",
3137
"type": "string"
3238
},
3339
"epsilon": {
40+
"description": "The differential privacy epsilon value for the DP query.",
3441
"type": "number"
3542
},
3643
"delta": {
44+
"description": "The differential privacy delta value for the DP query.",
3745
"type": "number"
3846
},
39-
4047
"snsql-metadata": {
48+
"description": "See https://docs.smartnoise.org/sql/metadata.html#yaml-format.",
4149
"type": "object",
4250
"properties": {
51+
"max_ids": {
52+
"type": "integer"
53+
},
54+
"row_privacy": {
55+
"type": "boolean"
56+
},
57+
"sample_max_ids": {
58+
"type": "boolean"
59+
},
4360
"censor_dims": {
4461
"type": "boolean"
62+
},
63+
"clamp_counts": {
64+
"type": "boolean"
65+
},
66+
"clamp_columns": {
67+
"type": "boolean"
68+
},
69+
"use_dpsu": {
70+
"type": "boolean"
4571
}
4672
},
4773
"patternProperties": {
48-
"^(?!censor_dims).*$": {
74+
"^(?!(max_ids|row_privacy|sample_max_ids|censor_dims|clamp_counts|clamp_columns|use_dpsu)).*$": {
4975
"type": "object",
76+
"additionalProperties": false,
77+
"required": ["type"],
5078
"properties": {
51-
"name": {
52-
"type": "string"
53-
},
5479
"type": {
5580
"type": "string"
5681
},
5782
"private_id": {
5883
"type": "boolean"
84+
},
85+
"lower": {
86+
"type": "number"
87+
},
88+
"upper": {
89+
"type": "number"
90+
},
91+
"nullable": {
92+
"type": "boolean"
93+
},
94+
"missing_value": {},
95+
"sensitivity": {
96+
"type": "number"
5997
}
6098
}
6199
}
@@ -66,60 +104,76 @@
66104
},
67105

68106
"story_generators": {
107+
"description": "An array of story generators.",
69108
"type": "array",
70109
"items": {
71110
"type": "object",
72111
"additionalProperties": false,
73112
"required": ["name", "num_stories_per_pass"],
74113
"properties": {
75114
"name": {
115+
"description": "The full name of a story generator (e.g. my_story_generators.short_story).",
76116
"type": "string"
77117
},
78118
"args": {
119+
"description": "Positional arguments to pass to the story generator.",
79120
"type": "array"
80121
},
81122
"kwargs": {
123+
"description": "Keyword arguments to pass to the story generator.",
82124
"type": "object"
83125
},
84126
"num_stories_per_pass": {
127+
"description": "The number of times to call the story generator per pass.",
85128
"type": "integer"
86129
}
87130
}
88131
}
89132
},
90133

91134
"max-unique-constraint-tries": {
135+
"description": "The maximum number of tries to respect a uniqueness constraint.",
92136
"type": "integer"
93137
},
94138

95139
"tables": {
140+
"description": "Table configurations.",
96141
"type": "object",
97142
"patternProperties": {
98143
".*": {
144+
"description": "A table configuration.",
99145
"additionalProperties": false,
100146
"type": "object",
101147
"properties": {
102148
"vocabulary_table": {
149+
"description": "Whether to export the table data.",
103150
"type": "boolean"
104151
},
105152
"num_rows_per_pass": {
153+
"description": "The number of rows to generate per pass.",
106154
"type": "integer"
107155
},
108156
"row_generators": {
157+
"description": "An array of row generators to create column values.",
109158
"type": "array",
110159
"items": {
111160
"type": "object",
161+
"required": ["name", "columns_assigned"],
112162
"properties": {
113163
"name": {
164+
"description": "The name of a (built-in or custom) function (e.g. max or my_row_generators.my_gen).",
114165
"type": "string"
115166
},
116167
"args": {
168+
"description": "Positional arguments to pass to the function.",
117169
"type": "array"
118170
},
119171
"kwargs": {
172+
"description": "Keyword arguments to pass to the function.",
120173
"type": "object"
121174
},
122175
"columns_assigned": {
176+
"description": "One or more columns to assign the return value to.",
123177
"type": ["array", "string"],
124178
"items": {
125179
"type": "string"

sqlsynthgen/main.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,13 @@
1616
from sqlsynthgen.make import make_src_stats, make_table_generators, make_tables_file
1717
from sqlsynthgen.remove import remove_db_data, remove_db_tables, remove_db_vocab
1818
from sqlsynthgen.settings import Settings, get_settings
19-
from sqlsynthgen.utils import import_file, read_yaml_file
19+
from sqlsynthgen.utils import CONFIG_SCHEMA_PATH, import_file, read_config_file
2020

2121
# pylint: disable=too-many-arguments
2222

2323
ORM_FILENAME: Final[str] = "orm.py"
2424
SSG_FILENAME: Final[str] = "ssg.py"
2525
STATS_FILENAME: Final[str] = "src-stats.yaml"
26-
CONFIG_SCHEMA_PATH: Final[Path] = (
27-
Path(__file__).parent / "json_schemas/config_schema.json"
28-
)
2926

3027
app = Typer(no_args_is_help=True)
3128

@@ -182,7 +179,7 @@ def make_generators(
182179
_require_src_db_dsn(settings)
183180

184181
orm_module: ModuleType = import_file(orm_file)
185-
generator_config = read_yaml_file(config_file) if config_file is not None else {}
182+
generator_config = read_config_file(config_file) if config_file is not None else {}
186183
result: str = make_table_generators(
187184
orm_module, generator_config, stats_file, overwrite_files=force
188185
)
@@ -214,7 +211,7 @@ def make_stats(
214211
if not force:
215212
_check_file_non_existence(stats_file_path)
216213

217-
config = read_yaml_file(config_file) if config_file is not None else {}
214+
config = read_config_file(config_file) if config_file is not None else {}
218215

219216
settings = get_settings()
220217
src_dsn: str = _require_src_db_dsn(settings)

sqlsynthgen/utils.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,43 @@
11
"""Utility functions."""
2+
import json
23
import os
34
import sys
45
from importlib import import_module
56
from pathlib import Path
67
from types import ModuleType
7-
from typing import Any, Optional, Union
8+
from typing import Any, Final, Optional, Union
89

910
import yaml
11+
from jsonschema.exceptions import ValidationError
12+
from jsonschema.validators import validate
1013
from sqlalchemy import create_engine, event, select
1114
from sqlalchemy.ext.asyncio import create_async_engine
1215

16+
CONFIG_SCHEMA_PATH: Final[Path] = (
17+
Path(__file__).parent / "json_schemas/config_schema.json"
18+
)
1319

14-
def read_yaml_file(path: str) -> Any:
15-
"""Read a yaml file in to dictionary, given a path."""
20+
21+
def read_config_file(path: str) -> dict:
    """Read a config file, warning if it is invalid.

    The file is parsed with ``yaml.safe_load`` and then checked against the
    JSON schema stored at ``CONFIG_SCHEMA_PATH``. A schema violation is
    reported on standard output but does not prevent the config from being
    returned.

    Args:
        path: The path to a YAML-format config file.

    Returns:
        The config file as a dictionary. An empty file yields an empty
        dictionary.

    Raises:
        TypeError: If the top level of the file is not a mapping.
    """
    with open(path, "r", encoding="utf8") as f:
        config = yaml.safe_load(f)

    # An empty (or comment-only) YAML document loads as None; treat it the
    # same as "no configuration" rather than crashing.
    if config is None:
        config = {}

    # Explicit check instead of ``assert``, which is stripped under ``-O``
    # and gives the user no explanation of what went wrong.
    if not isinstance(config, dict):
        raise TypeError(
            f"The config file {path} must contain a mapping at the top level, "
            f"not {type(config).__name__}."
        )

    schema_config = json.loads(CONFIG_SCHEMA_PATH.read_text(encoding="UTF-8"))
    try:
        validate(config, schema_config)
    except ValidationError as e:
        # Deliberately a warning only: an invalid config is still returned.
        print("The config file is invalid:", e.message)

    return config
1942

2043

tests/examples/example_config.yaml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,22 @@ src-stats:
2525
snsql-metadata:
2626
# You may well want censor_dims to be on, but we turn it off for the
2727
# tests to silence a smartnoise-sql nag warning.
28+
max_ids: 10
29+
row_privacy: False
30+
sample_max_ids: True
2831
censor_dims: False
32+
clamp_counts: False
33+
clamp_columns: False
34+
use_dpsu: False
2935
person_id:
30-
name: person_id
3136
type: int
3237
private_id: True
38+
lower: 10
39+
upper: 100
40+
nullable: True
41+
sensitivity: 1
42+
# missing_value: breaks things
3343
research_opt_out:
34-
name: research_opt_out
3544
type: boolean
3645
private_id: False
3746

tests/test_utils.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
11
"""Tests for the utils module."""
22
import os
33
import sys
4+
from io import StringIO
45
from pathlib import Path
6+
from unittest.mock import patch
57

68
from pydantic import PostgresDsn
79
from pydantic.tools import parse_obj_as
810
from sqlalchemy import Column, Integer, create_engine, insert
911
from sqlalchemy.orm import declarative_base
1012

11-
from sqlsynthgen.utils import create_db_engine, download_table, import_file
13+
from sqlsynthgen.utils import (
14+
create_db_engine,
15+
download_table,
16+
import_file,
17+
read_config_file,
18+
)
1219
from tests.utils import RequiresDBTestCase, SSGTestCase, run_psql
1320

1421
# pylint: disable=invalid-name
@@ -116,3 +123,14 @@ def test_connect_async(self) -> None:
116123

117124
# With schema
118125
create_db_engine(self.dsn, schema_name="public", use_asyncio=True)
126+
127+
128+
class TestReadConfig(SSGTestCase):
    """Check the behaviour of the read_config_file function."""

    def test_warns_of_invalid_config(self) -> None:
        """An invalid config should produce a warning on standard output."""
        captured = StringIO()

        # Swap stdout for an in-memory buffer while the config is read.
        with patch("sys.stdout", new=captured):
            read_config_file("tests/examples/invalid_config.yaml")

        printed = captured.getvalue()
        self.assertIn("The config file is invalid:", printed)

0 commit comments

Comments
 (0)