diff --git a/discovery/handlers/api/schema.py b/discovery/handlers/api/schema.py index 1b1f4d71..db8f01eb 100644 --- a/discovery/handlers/api/schema.py +++ b/discovery/handlers/api/schema.py @@ -412,7 +412,7 @@ async def get(self): "validation_merge": validation_merge, "raise_on_validation_error": False, } - schema_org_version = schemas.get_schema_org_version() + schema_org_version = schemas.get_stored_schema_org_version() _kwargs = { "validator_options": validator_options, "schema_org_version": schema_org_version, diff --git a/discovery/registry/schemas.py b/discovery/registry/schemas.py index fc666969..4f4476c8 100644 --- a/discovery/registry/schemas.py +++ b/discovery/registry/schemas.py @@ -16,7 +16,7 @@ from discovery.model import Schema as ESSchemaFile from discovery.model import SchemaClass as ESSchemaClass from discovery.utils.adapters import SchemaAdapter -from discovery.utils.adapters import get_schema_org_version as _get_schema_org_version +from discovery.utils.adapters import get_latest_schema_org_version from discovery.utils.indices import get_schema_index_meta, save_schema_index_meta from .common import ConflictError, NoEntityError, RegistryDocument, RegistryError, ValidatedDict @@ -38,11 +38,12 @@ # ---------------- -def _add_schema_class(schema, namespace, dryrun=False): +def _add_schema_class(schema, namespace, dryrun=False, schema_org_version=None): assert isinstance(schema, (ValidatedDict, type(None))) - # set the stored schema.org version to load base schemas - schema_org_version = get_schema_org_version() + # set the stored schema.org version to load base schemas (unless overridden) + if schema_org_version is None: + schema_org_version = get_latest_schema_org_version() if schema is None and namespace == "schema": # handling "schema" namespace (from schema.org) differently @@ -221,7 +222,7 @@ def get(namespace): if namespace == "schema": schema = RegistryDocument(_id="schema") schema.meta.url = "https://schema.org/docs/tree.jsonld" - 
def add_core(update=False, schema_org_version=None):
    """Add (or refresh) the schema.org core schema under the "schema" namespace.

    Args:
        update: when true, reload the core schema even if the "schema"
            namespace already exists.
        schema_org_version: schema.org version to load.  ``None`` (the
            default) resolves to ``get_latest_schema_org_version()``.

    Nothing happens when the "schema" namespace already exists and
    ``update`` is false.
    """
    if exists("schema") and not update:
        return

    if schema_org_version is None:
        schema_org_version = get_latest_schema_org_version()
    _add_schema_class(None, "schema", schema_org_version=schema_org_version)

    # Record the version that was requested for the load above.
    # store_schema_org_version() always writes the *latest* version, which
    # would leave the index metadata out of step with the loaded classes
    # whenever a caller pins an explicit version.
    save_schema_index_meta({"schema_org_version": schema_org_version})
def monthly_schemaorg_update():
    """Bring the stored schema.org core schema up to the latest available version.

    The routine never raises; every outcome is reported through the module
    logger, and the elapsed time is always logged at the end.

    Steps:
        1. Read the version recorded in the schema index metadata
           (``schemas.get_stored_schema_org_version()``) and the version
           reported by ``get_latest_schema_org_version()``; stop when they
           are equal.
        2. Validate the latest version with a dry run of
           ``_add_schema_class``.  A ``RegistryError`` or ``AttributeError``
           aborts the run before anything is written by this routine.
        3. Apply the update through ``schemas.add_core(update=True, ...)``.
        4. Re-read the stored version and log whether it matches.

    To run manually:
        from discovery.utils.update import monthly_schemaorg_update
        monthly_schemaorg_update()
    """
    logger.info("Starting monthly schema.org update process")
    # perf_counter measures elapsed wall-clock time; process_time would only
    # count CPU time and ignore any time spent blocked on I/O.
    started = time.perf_counter()

    try:
        current_version = schemas.get_stored_schema_org_version()
        logger.info("Current schema.org version in DDE: %s", current_version)

        latest_version = get_latest_schema_org_version()
        logger.info("Latest schema.org version available: %s", latest_version)

        if current_version == latest_version:
            logger.info("Schema.org is already at the latest version. No update needed.")
            return

        # Validate with a dry run before touching the stored schema.
        logger.info("Validating schema.org version %s (dry-run)...", latest_version)
        try:
            class_count = _add_schema_class(None, "schema", dryrun=True, schema_org_version=latest_version)
        except (RegistryError, AttributeError) as validation_error:
            _log_schemaorg_validation_failure(latest_version, validation_error)
            return
        logger.info("Validation passed - %s schema classes validated", class_count)

        logger.info("Updating schema.org from %s to %s", current_version, latest_version)
        schemas.add_core(update=True, schema_org_version=latest_version)

        # Verify the update by reading the stored version back.
        new_version = schemas.get_stored_schema_org_version()
        if new_version == latest_version:
            logger.info("Update verified - schema.org is now at version %s", new_version)
        else:
            logger.warning("Version mismatch: expected %s, got %s", latest_version, new_version)

    except Exception as e:
        # Top-level boundary of a background job: keep it from propagating,
        # but record the traceback so the failure can be diagnosed.
        logger.exception("Error during monthly schema.org update: %s", e)
    finally:
        total_time = time.perf_counter() - started
        logger.info("Monthly schema.org update complete, processing time: %.2f seconds", total_time)


def _log_schemaorg_validation_failure(version, error):
    """Log the details of a failed dry-run validation of schema.org ``version``."""
    if isinstance(error, RegistryError):
        headline = "Validation failed for schema.org version %s"
        verdict = "DDE schema.org will not be updated"
    else:
        # AttributeError: per the original note, raised from _add_schema_class
        # when cls.full_clean() fails during validation.
        headline = "Schema class validation failed for schema.org version %s"
        verdict = "DDE schema.org will not be updated - schema class has invalid attributes"

    logger.error(headline, version)
    logger.error("Error type: %s", type(error).__name__)
    logger.error("Error message: %s", error)
    if hasattr(error, "status_code"):
        logger.error("Status code: %s", error.status_code)
    formatted = "".join(traceback.format_exception(type(error), error, error.__traceback__))
    logger.debug("Full traceback:\n%s", formatted)
    logger.error(verdict)
@pytest.fixture(scope="session")
def ensure_schema_org(ensure_test_data, es_client):
    """Session fixture: load the schema.org core once if it is not present.

    Kept apart from ensure_test_data because, as the original note says,
    loading schema.org is slow.  Depends on ensure_test_data so the test
    indices are restored first.
    """
    from discovery.registry import schemas

    # Nothing to do when the "schema" namespace is already registered.
    if schemas.exists("schema"):
        return

    print("Loading schema.org core (this may take a few minutes)...")
    schemas.add_core()
    all_indices = ",".join(INDEX_NAMES)
    es_client.indices.refresh(index=all_indices)
    print("Schema.org core loaded")
schemas.get_stored_schema_org_version() assert current_version_after_add == test_version, ( f"Schema.org version should remain {test_version}, got {current_version_after_add}" ) diff --git a/tests/test_schemaorg.py b/tests/test_schemaorg.py index a939f678..5636df6e 100644 --- a/tests/test_schemaorg.py +++ b/tests/test_schemaorg.py @@ -5,11 +5,15 @@ onto the base schema loader, ensuring version consistency during validation. """ from unittest.mock import patch +import pytest +import logging + from discovery.registry import schemas -from discovery.registry.common import NoEntityError +from discovery.registry.common import NoEntityError, RegistryError from discovery.utils.indices import save_schema_index_meta, refresh -from discovery.utils.adapters import DDEBaseSchemaLoader +from discovery.utils.adapters import DDEBaseSchemaLoader, SchemaAdapter, get_latest_schema_org_version +from discovery.utils.update import monthly_schemaorg_update from biothings_schema.dataload import BaseSchemaLoader @@ -28,7 +32,6 @@ def test_biothings_schema_compatibility(self): def test_load_dde_schemas_method(self): """Test that load_dde_schemas method works""" - from discovery.utils.adapters import DDEBaseSchemaLoader loader = DDEBaseSchemaLoader(verbose=False) # Mock the schema data @@ -55,7 +58,6 @@ def test_load_dde_schemas_method(self): def test_schema_adapter_accepts_version_parameter(self): """Test that SchemaAdapter accepts schema_org_version as a parameter""" - from discovery.utils.adapters import SchemaAdapter # Create a simple test schema test_schema = { @@ -79,7 +81,6 @@ def test_schema_adapter_passes_version_to_parser(self): Test that SchemaAdapter passes schema_org_version parameter to the underlying SchemaParser, which then sets it on the base_schema_loader. 
""" - from discovery.utils.adapters import SchemaAdapter, DDEBaseSchemaLoader # Create a simple test schema test_schema = { @@ -105,7 +106,6 @@ def test_schema_validation_with_specific_version(self): Test that SchemaAdapter correctly passes schema_org_version parameter to ensure biothings_schema uses the specified version for validation. """ - from discovery.utils.adapters import SchemaAdapter, DDEBaseSchemaLoader # Create a test schema that uses schema.org classes test_schema = { @@ -156,7 +156,7 @@ def test_add_schema_passes_version_correctly(self, ensure_test_data): save_schema_index_meta({"schema_org_version": test_version}) # Verify it was stored - stored_version = schemas.get_schema_org_version() + stored_version = schemas.get_stored_schema_org_version() assert stored_version == test_version # Now add a test schema - it should use this version internally @@ -174,11 +174,12 @@ def test_add_schema_passes_version_correctly(self, ensure_test_data): count = schemas.add(namespace=test_namespace, url=test_url, user="test@example.com") assert count > 0, f"Schema should have classes, got count={count}" + refresh() # Verify schema was added assert schemas.exists(test_namespace) # Verify the version hasn't changed - assert schemas.get_schema_org_version() == test_version + assert schemas.get_stored_schema_org_version() == test_version finally: # Clean up @@ -189,10 +190,9 @@ def test_add_schema_passes_version_correctly(self, ensure_test_data): def test_version_stored_after_restore(self, ensure_test_data): """Test that schema.org version is stored after restore_from_file""" - from discovery.registry.schemas import get_schema_org_version # After restore (via ensure_test_data fixture), version should be stored - version = get_schema_org_version() + version = get_latest_schema_org_version() assert version is not None, "schema.org version should be stored after restore" assert isinstance(version, str), "version should be a string" assert "." 
def test_add_core_with_explicit_version(self, ensure_test_data):
    """Test add_core with an explicit schema_org_version argument.

    Exercises the code path where schema_org_version is provided (not None)
    and checks that the schema.org core classes are present afterwards.
    """
    # Use a known valid version
    explicit_version = "29.0"

    # update=True forces a reload even when the "schema" namespace exists
    schemas.add_core(update=True, schema_org_version=explicit_version)

    # Refresh before reading back, as the sibling test
    # test_add_schema_passes_version_correctly does before schemas.exists().
    refresh()

    # Verify schema.org was added
    assert schemas.exists("schema")

    # Verify classes were loaded
    schema_classes = list(schemas.get_classes("schema"))
    assert schema_classes, "schema.org should have classes"

    # Verify schema:Thing exists (a fundamental schema.org class)
    thing_class = schemas.get_class("schema", "schema:Thing", raise_on_error=False)
    assert thing_class is not None, "schema:Thing should exist"
def test_monthly_update_validates_before_update(self, ensure_test_data):
    """Test that monthly update runs the dry-run validation and then updates."""

    # Store a fake old version to trigger update attempt
    save_schema_index_meta({"schema_org_version": "23.9"})
    assert schemas.get_stored_schema_org_version() == "23.9"

    # Spy on the name the update routine calls for its dry run.  wraps=
    # forwards every call to the real function, so the update still runs
    # for real while the call is recorded.
    with patch("discovery.utils.update._add_schema_class", wraps=schemas._add_schema_class) as dry_run_spy:
        monthly_schemaorg_update()

    # The validation step must have happened, and as a dry run
    dry_run_spy.assert_called_once()
    _, call_kwargs = dry_run_spy.call_args
    assert call_kwargs.get("dryrun") is True

    # Version should be updated to latest
    new_version = schemas.get_stored_schema_org_version()
    latest_version = get_latest_schema_org_version()
    assert new_version == latest_version, f"Expected {latest_version}, got {new_version}"
def test_monthly_update_logs_status_code_when_present(self, ensure_test_data, caplog):
    """Test that a status_code attribute on the raised error is logged."""

    # An old stored version forces the routine down the update path
    save_schema_index_meta({"schema_org_version": "23.0"})

    failure = RegistryError("HTTP error occurred")
    failure.status_code = 404

    with caplog.at_level(logging.ERROR), patch('discovery.utils.update._add_schema_class', side_effect=failure):
        monthly_schemaorg_update()

    # The status code must appear in one of the captured messages
    logged = [entry.message for entry in caplog.records]
    assert any("Status code: 404" in text for text in logged)
def test_monthly_update_handles_attribute_error(self, ensure_test_data, caplog):
    """Test that monthly update handles AttributeError from schema class validation.

    Requests ensure_test_data like the sibling tests that write the stored
    version, so the test does not rely on another test having prepared the
    indices first.
    """

    # Set an old version to trigger the update path
    save_schema_index_meta({"schema_org_version": "23.0"})

    with caplog.at_level(logging.ERROR):
        with patch('discovery.utils.update._add_schema_class') as mock_add:
            # Original note: AttributeError is raised when cls.full_clean()
            # fails in _add_schema_class
            mock_add.side_effect = AttributeError("'NoneType' object has no attribute 'name'")
            monthly_schemaorg_update()

    # Verify error type is logged
    messages = [record.message for record in caplog.records]
    assert any("Error type: AttributeError" in message for message in messages)
    assert any("invalid attributes" in message for message in messages)

    # Version should remain unchanged (update was aborted)
    assert schemas.get_stored_schema_org_version() == "23.0"