Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion discovery/handlers/api/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ async def get(self):
"validation_merge": validation_merge,
"raise_on_validation_error": False,
}
schema_org_version = schemas.get_schema_org_version()
schema_org_version = schemas.get_stored_schema_org_version()
_kwargs = {
"validator_options": validator_options,
"schema_org_version": schema_org_version,
Expand Down
22 changes: 13 additions & 9 deletions discovery/registry/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from discovery.model import Schema as ESSchemaFile
from discovery.model import SchemaClass as ESSchemaClass
from discovery.utils.adapters import SchemaAdapter
from discovery.utils.adapters import get_schema_org_version as _get_schema_org_version
from discovery.utils.adapters import get_latest_schema_org_version
from discovery.utils.indices import get_schema_index_meta, save_schema_index_meta

from .common import ConflictError, NoEntityError, RegistryDocument, RegistryError, ValidatedDict
Expand All @@ -38,11 +38,12 @@
# ----------------


def _add_schema_class(schema, namespace, dryrun=False):
def _add_schema_class(schema, namespace, dryrun=False, schema_org_version=None):
assert isinstance(schema, (ValidatedDict, type(None)))

# set the stored schema.org version to load base schemas
schema_org_version = get_schema_org_version()
# set the stored schema.org version to load base schemas (unless overridden)
if schema_org_version is None:
schema_org_version = get_latest_schema_org_version()

if schema is None and namespace == "schema":
# handling "schema" namespace (from schema.org) differently
Expand Down Expand Up @@ -221,7 +222,7 @@ def get(namespace):
if namespace == "schema":
schema = RegistryDocument(_id="schema")
schema.meta.url = "https://schema.org/docs/tree.jsonld"
schema.meta.version = _get_schema_org_version()
schema.meta.version = get_latest_schema_org_version()
schema["$comment"] = "internally provided by biothings.schema"
schema["@context"] = {"schema": "http://schema.org/"}
return schema
Expand Down Expand Up @@ -367,10 +368,13 @@ def total(user=None):
return search.count()


def add_core(update=False):
def add_core(update=False, schema_org_version=None):
"""add schema.org main schema."""
if not exists("schema") or update:
_add_schema_class(None, "schema")
# Use the latest schema.org version from biothings_schema when updating
if schema_org_version is None:
schema_org_version = get_latest_schema_org_version()
_add_schema_class(None, "schema", schema_org_version=schema_org_version)
store_schema_org_version()


Expand Down Expand Up @@ -526,11 +530,11 @@ def store_schema_org_version():
Make sure you call this function right after you have added the schema_org schema
(e.g. after add_core is called)
"""
ver = _get_schema_org_version()
ver = get_latest_schema_org_version()
save_schema_index_meta({"schema_org_version": ver})


def get_schema_org_version():
def get_stored_schema_org_version():
"""
Get the stored schema_org schema version from Schema index metadata.
Return None if not found.
Expand Down
6 changes: 3 additions & 3 deletions discovery/utils/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,17 @@

from biothings_schema import Schema as SchemaParser
from biothings_schema.dataload import BaseSchemaLoader
from biothings_schema.dataload import get_schemaorg_version as _get_schemaorg_version
from biothings_schema.dataload import get_schemaorg_version as get_latest_schemaorg_version

from discovery.registry import schemas

# the underlying package uses warnings
logging.captureWarnings(True)


def get_schema_org_version():
def get_latest_schema_org_version():
"""return the current schema_org schema version"""
return _get_schemaorg_version()
return get_latest_schemaorg_version()


class DDEBaseSchemaLoader(BaseSchemaLoader):
Expand Down
74 changes: 72 additions & 2 deletions discovery/utils/update.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import logging
import time
import traceback

from discovery.registry import schemas
from discovery.registry.schemas import _add_schema_class
from discovery.model import Schema
from discovery.utils.adapters import get_latest_schema_org_version
from discovery.registry.common import RegistryError

logging.basicConfig(level="INFO")
logger = logging.getLogger("daily-schema-update")
logger = logging.getLogger(__name__)

def schema_update(namespace):
""" Update registered schemas by namespace.
Expand Down Expand Up @@ -37,3 +40,70 @@ def daily_schema_update():
schema_count += 1
total_time = time.process_time() - start
logger.info(f'update process complete, total processing time was {total_time} seconds for {schema_count} schemas')


def monthly_schemaorg_update():
    """Update the schema.org core schema stored in DDE to the latest version.

    Steps:
      1. Read the schema.org version currently stored in DDE and the latest
         version reported by ``get_latest_schema_org_version()``.
      2. Return early when the two are equal.
      3. Validate the latest version with a dry-run of ``_add_schema_class``.
         If the dry-run raises ``RegistryError`` or ``AttributeError`` the
         failure is logged and DDE's schema.org is left untouched.
      4. Otherwise call ``schemas.add_core(update=True, ...)`` and read the
         stored version back to verify that it matches the expected one.

    Exceptions derived from ``Exception`` are logged (with traceback) rather
    than propagated. The function always returns ``None``.

    To run manually:
        from discovery.utils.update import monthly_schemaorg_update
        monthly_schemaorg_update()
    """
    logger.info("Starting monthly schema.org update process")
    # NOTE(review): process_time() reports CPU time of the process, not
    # wall-clock time; kept as-is to match the timing used by the daily update.
    start = time.process_time()

    try:
        # Get current schema.org version stored in DDE
        current_version = schemas.get_stored_schema_org_version()
        logger.info(f"Current schema.org version in DDE: {current_version}")

        # Get the latest available schema.org version from biothings_schema
        latest_version = get_latest_schema_org_version()
        logger.info(f"Latest schema.org version available: {latest_version}")

        # Check if update is needed
        if current_version == latest_version:
            logger.info("Schema.org is already at the latest version. No update needed.")
            return

        # Validate by performing a dry-run before actual update
        logger.info(f"Validating schema.org version {latest_version} (dry-run)...")
        try:
            class_count = _add_schema_class(None, "schema", dryrun=True, schema_org_version=latest_version)
        except RegistryError as registry_error:
            logger.error(f"Validation failed for schema.org version {latest_version}")
            logger.error(f"Error type: {type(registry_error).__name__}")
            logger.error(f"Error message: {registry_error}")
            if hasattr(registry_error, 'status_code'):
                logger.error(f"Status code: {registry_error.status_code}")
            logger.debug(f"Full traceback:\n{traceback.format_exc()}")
            logger.error("DDE schema.org will not be updated")
            return
        except AttributeError as attr_error:
            # NOTE(review): reportedly raised from _add_schema_class when
            # cls.full_clean() fails during validation - confirm against that function.
            logger.error(f"Schema class validation failed for schema.org version {latest_version}")
            logger.error(f"Error type: {type(attr_error).__name__}")
            logger.error(f"Error message: {attr_error}")
            logger.debug(f"Full traceback:\n{traceback.format_exc()}")
            logger.error("DDE schema.org will not be updated - schema class has invalid attributes")
            return
        else:
            logger.info(f"Validation passed - {class_count} schema classes validated")

        # Validation passed - perform the actual update
        logger.info(f"Updating schema.org from {current_version} to {latest_version}")
        schemas.add_core(update=True, schema_org_version=latest_version)

        # Verify the update by reading the stored version back
        new_version = schemas.get_stored_schema_org_version()
        if new_version == latest_version:
            logger.info(f"Update verified - schema.org is now at version {new_version}")
        else:
            logger.warning(f"Version mismatch: expected {latest_version}, got {new_version}")

    except Exception as e:
        # Top-level boundary of the scheduled job: keep the process alive, but
        # record the full traceback so unexpected failures can be diagnosed.
        logger.exception(f"Error during monthly schema.org update: {e}")
    finally:
        # Runs on every exit path, including the early returns above.
        total_time = time.process_time() - start
        logger.info(f'Monthly schema.org update complete, processing time: {total_time:.2f} seconds')
9 changes: 8 additions & 1 deletion index.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from discovery.notify import update_n3c_routine
from discovery.utils.backup import daily_backup_routine
from discovery.utils.coverage import daily_coverage_update
from discovery.utils.update import daily_schema_update
from discovery.utils.update import daily_schema_update, monthly_schemaorg_update

define("prod", default=False, help="Run in production mode", type=bool)
define("proxy_url", default="http://localhost:3000/", help="localhost port serving frontend")
Expand Down Expand Up @@ -102,8 +102,15 @@ def run_routine():
thread.start()


def run_monthly_schemaorg_update():
    """Start monthly_schemaorg_update on a new daemon thread and return without waiting for it."""
    Thread(target=monthly_schemaorg_update, daemon=True).start()


if __name__ == "__main__":
    options.parse_command_line()

    # Scheduled routines are registered only for non-debug production runs.
    scheduling_enabled = not options.debug and options.prod
    if scheduling_enabled:
        # run daily at mid-night
        crontab("0 0 * * *", func=run_routine, start=True)
        # run monthly on the 1st at 2 AM
        crontab("0 2 1 * *", func=run_monthly_schemaorg_update, start=True)

    main(HANDLERS, use_curl=True)
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@ profile = "black"
combine_as_imports = true
line_length = 159
src_paths = ["."]

[tool.pytest.ini_options]
markers = [
'monthly: marks tests as monthly update tests (skipped by default, run with "-m monthly")',
]
addopts = "-m 'not monthly'"
17 changes: 16 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,25 @@ def index_exists_and_has_docs(es: Elasticsearch, idx: str) -> bool:
def ensure_test_data(es_client):
"""Prepare ES indices once per test session."""
# Always restore to ensure clean state - don't skip based on existing data
print("⚠️ Restoring test data for clean state")
print("Restoring test data for clean state")
restore_from_file(BACKUP_FILE)
es_client.indices.refresh(index=",".join(INDEX_NAMES))


@pytest.fixture(scope="session")
def ensure_schema_org(ensure_test_data, es_client):
    """Session-scoped fixture that loads the schema.org core if it is missing.

    Depends on ``ensure_test_data`` so the test data is restored first. Kept
    as its own fixture because, per the original author, loading schema.org
    is slow (it fetches and processes thousands of classes from the network).
    Does nothing when the "schema" namespace already exists.
    """
    from discovery.registry import schemas

    # Guard clause: nothing to do when schema.org core is already registered.
    if schemas.exists("schema"):
        return

    print("Loading schema.org core (this may take a few minutes)...")
    schemas.add_core()
    # Refresh all test indices after loading the core.
    all_indices = ",".join(INDEX_NAMES)
    es_client.indices.refresh(index=all_indices)
    print("Schema.org core loaded")

# conftest.py (continued)

@pytest.fixture(scope="module")
Expand Down
8 changes: 4 additions & 4 deletions tests/test_schema_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,23 +111,23 @@ def test_30_post(self):

def test_40_schema_org_version_initially_none(self):
"""Test that schema.org version is None at initialization (loader sets it later)"""
version = schemas.get_schema_org_version()
version = schemas.get_stored_schema_org_version()
# Version is intentionally None at initialization until the loader populates it
assert version is None, "schema.org version should be None at initialization"

def test_41_change_schema_version_and_add_schema(self):
"""Test that schemas are added using the stored schema.org version"""

# Get original version (may be None at initialization)
original_version = schemas.get_schema_org_version()
original_version = schemas.get_stored_schema_org_version()

# Change to a different version
test_version = "15.0"
indices.save_schema_index_meta({"schema_org_version": test_version})
self.refresh()

# Verify version was changed
current_version = schemas.get_schema_org_version()
current_version = schemas.get_stored_schema_org_version()
assert current_version == test_version, f"Version should be {test_version}"

# Add a new schema - _add_schema_class will pass this version to SchemaAdapter
Expand All @@ -145,7 +145,7 @@ def test_41_change_schema_version_and_add_schema(self):
assert count > 0, f"Schema should have classes, got count={count}"

# Verify the version hasn't changed
current_version_after_add = schemas.get_schema_org_version()
current_version_after_add = schemas.get_stored_schema_org_version()
assert current_version_after_add == test_version, (
f"Schema.org version should remain {test_version}, got {current_version_after_add}"
)
Expand Down
Loading