From 9809e39ac21f8a4bc9e301f399f6d35c0d91279d Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Wed, 6 Aug 2025 20:16:19 -0400 Subject: [PATCH 1/5] Assign urls to edx contentfiles when possible --- learning_resources/etl/utils.py | 154 +++++++++++++++++++++- learning_resources_search/indexing_api.py | 1 + vector_search/tasks.py | 1 + 3 files changed, 153 insertions(+), 3 deletions(-) diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py index 2e9058e261..e4e708241b 100644 --- a/learning_resources/etl/utils.py +++ b/learning_resources/etl/utils.py @@ -18,10 +18,12 @@ from pathlib import Path from subprocess import check_call from tempfile import TemporaryDirectory +from typing import Optional import boto3 import rapidjson import requests +from defusedxml.ElementTree import ParseError, parse from django.conf import settings from django.utils.dateparse import parse_duration from django.utils.text import slugify @@ -341,7 +343,7 @@ def documents_from_olx( "mime_type": mimetype, "archive_checksum": archive_checksum, "file_extension": extension_lower, - "source_path": f"{path}/{filename}", + "source_path": f"{path}/{filename.replace(' ', '_')}", }, ) @@ -407,7 +409,150 @@ def text_from_sjson_content(content: str): return " ".join(data.get("text", [])) +def get_root_url_for_source(etl_source: str) -> tuple[str, str]: + """ + Get the base URL and path for an ETL source + + Args: + etl_source (str): The ETL source path + + Returns: + tuple[str, str]: The base URL and path + """ + mapping = { + ETLSource.mitxonline.value: "https://courses.mitxonline.mit.edu", + ETLSource.xpro.value: "https://courses.xpro.mit.edu", + ETLSource.mit_edx.value: "https://www.edx.org", + ETLSource.oll.value: "https://openlearninglibrary.mit.edu", + } + return mapping.get(etl_source) + + +def is_valid_uuid(uuid_string: str) -> bool: + """ + Check if a string is a valid UUID + """ + try: + uuid.UUID(uuid_string) + except ValueError: + return False + return True + + +def get_url_from_module_id( + olx_path: str, + module_id: str, + run: LearningResourceRun, + assets_metadata: Optional[dict] = None, + video_srt_metadata: Optional[dict] = None, +) -> str: + """ + Get the URL for a module based on its ID + + Args: + module_id (str): The module ID + run (LearningResourceRun): The run associated with the module + + Returns: + str: The URL for the module + """ + if not module_id: + log.warning("Module ID is empty") + return None + root_url = get_root_url_for_source(run.learning_resource.etl_source) + with Path.open("video_metadata.json", "w") as f: + json.dump(video_srt_metadata, f, indent=2) + if module_id.startswith("asset"): + log.info("Getting URL for asset %s", module_id) + asset_meta = ( + assets_metadata.get(Path(olx_path).parts[-1], {}) if assets_metadata else {} + ) + video_meta = video_srt_metadata.get(module_id, {}) if video_srt_metadata else {} + if video_meta: + log.info("Found video metadata for %s", module_id) + return f"{root_url}/xblock/{video_meta}" + elif module_id.endswith(".srt"): + log.info("NO VIDEO METADATA FOR %s", module_id) + middle_path = asset_meta.get("custom_md5", "") + return f"{root_url}/{(middle_path + '/') if middle_path else ''}{module_id}" + elif module_id.startswith("block") and is_valid_uuid(module_id.split("@")[-1]): + return f"{root_url}/xblock/{module_id}" + else: + log.warning("Unknown module ID format: %s", module_id) + return None + + +def get_assets_metadata(olx_path: str) -> dict: + """ + Get metadata for assets in an OLX path + + Args: + olx_path (str): The path to the OLX directory + """ + try: + with Path.open(Path(olx_path, "policies/assets.json"), "rb") as f: + return json.loads(f.read()) + except FileNotFoundError: + log.warning("Assets metadata file does not exist: %s", olx_path) + + +def parse_video_transcripts_xml( + run: LearningResourceRun, xml_content: str, path: Path +) -> dict: + """ + Parse video XML content and create a mapping of + transcript edx_module_id to video edx_module_id + """ + transcript_mapping = {} + try: + root = parse(xml_content) + + # Get the video url_name from the root video element + video_url_name = root.get("url_name") + if not video_url_name: + log.warning("No url_name found in video XML") + return {} + + # Find all transcript elements and extract their src attributes + for transcript in root.findall(".//transcript"): + transcript_src = transcript.get("src") + if transcript_src: + transcript_mapping[ + get_edx_module_id(f"static/{transcript_src}", run) + ] = get_edx_module_id(str(path), run) + except ParseError: + log.exception("Error parsing video XML for %s: %s", run, path) + return transcript_mapping + + +def get_video_metadata(olx_path: str, run: LearningResourceRun) -> dict: + """ + Get metadata for video SRT files in an OLX path + """ + video_transcript_mapping = {} + video_path = Path(olx_path, "video") + if not video_path.exists(): + log.warning("No video directory found in OLX path: %s", olx_path) + return video_transcript_mapping + for root, _, files in os.walk(str(Path(olx_path, "video"))): + path = "/".join(root.split("/")[3:]) + for filename in files: + log.info("Processing video file %s in %s", filename, path) + extension_lower = Path(filename).suffix.lower() + if extension_lower == ".xml": + with Path.open(Path(root, filename), "rb") as f: + video_xml = f.read().decode("utf-8") + + # Parse the XML and get transcript mappings + transcript_mapping = parse_video_transcripts_xml(run, video_xml, f) + video_transcript_mapping.update(transcript_mapping) + + return video_transcript_mapping + + def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite): + assets_metadata = get_assets_metadata(olx_path) + video_srt_metadata = get_video_metadata(olx_path, run) for document, metadata in documents_from_olx(olx_path): source_path = metadata.get("source_path") edx_module_id = get_edx_module_id(source_path, run) @@ -465,6 +610,9 @@ def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite): "file_extension": file_extension, "source_path": source_path, "edx_module_id": edx_module_id, + "url": get_url_from_module_id( + source_path, edx_module_id, run, assets_metadata, video_srt_metadata + ), **content_dict, } ) @@ -741,7 +889,7 @@ def parse_certification(offeror, runs_data): ) -def iso8601_duration(duration_str: str) -> str or None: +def iso8601_duration(duration_str: str) -> str | None: """ Parse the duration from a string and return it in ISO-8601 format @@ -821,7 +969,7 @@ def calculate_weeks(num: int, from_unit: str) -> int: return num -def transform_interval(interval_txt: str) -> str or None: +def transform_interval(interval_txt: str) -> str | None: """ Transform any interval units to standard English units Only languages currently supported are English and Spanish diff --git a/learning_resources_search/indexing_api.py b/learning_resources_search/indexing_api.py index c5baf10019..e9a0d14b5f 100644 --- a/learning_resources_search/indexing_api.py +++ b/learning_resources_search/indexing_api.py @@ -398,6 +398,7 @@ def index_run_content_files(run_id, index_types): index_types (string): one of the values IndexestoUpdate. Whether the default index, the reindexing index or both need to be updated """ + return run = LearningResourceRun.objects.get(pk=run_id) content_file_ids = run.content_files.filter(published=True).values_list( "id", flat=True diff --git a/vector_search/tasks.py b/vector_search/tasks.py index 461a6692a1..5ecc4c66aa 100644 --- a/vector_search/tasks.py +++ b/vector_search/tasks.py @@ -52,6 +52,7 @@ def generate_embeddings(ids, resource_type, overwrite): resource_type (string): resource_type value for the learning resource objects """ + return None try: with wrap_retry_exception(*SEARCH_CONN_EXCEPTIONS): embed_learning_resources(ids, resource_type, overwrite) From 79927e9af4df0883f3f1cc1d2225e0299b69c123 Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Thu, 7 Aug 2025 16:37:08 -0400 Subject: [PATCH 2/5] Some refactoring and unit tests --- learning_resources/etl/utils.py | 34 +-- learning_resources/etl/utils_test.py | 299 ++++++++++++++++++++++ learning_resources_search/indexing_api.py | 1 - main/settings_course_etl.py | 13 + vector_search/tasks.py | 1 - 5 files changed, 331 insertions(+), 17 deletions(-) diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py index e4e708241b..f8193431ce 100644 --- a/learning_resources/etl/utils.py +++ b/learning_resources/etl/utils.py @@ -23,7 +23,7 @@ import boto3 import rapidjson import requests -from defusedxml.ElementTree import ParseError, parse +from defusedxml import ElementTree from django.conf import settings from django.utils.dateparse import parse_duration from django.utils.text import slugify @@ -420,10 +420,10 @@ def get_root_url_for_source(etl_source: str) -> tuple[str, str]: tuple[str, str]: The base URL and path """ mapping = { - ETLSource.mitxonline.value: "https://courses.mitxonline.mit.edu", - ETLSource.xpro.value: "https://courses.xpro.mit.edu", - ETLSource.mit_edx.value: "https://www.edx.org", - ETLSource.oll.value: "https://openlearninglibrary.mit.edu", + ETLSource.mitxonline.value: settings.CONTENT_BASE_URL_MITXONLINE, + ETLSource.xpro.value: settings.CONTENT_BASE_URL_XPRO, + ETLSource.oll.value: settings.CONTENT_BASE_URL_OLL, + ETLSource.mit_edx.value: settings.CONTENT_BASE_URL_EDX, } return mapping.get(etl_source) @@ -460,23 +460,27 @@ def get_url_from_module_id( log.warning("Module ID is empty") return None root_url = get_root_url_for_source(run.learning_resource.etl_source) - with Path.open("video_metadata.json", "w") as f: - json.dump(video_srt_metadata, f, indent=2) + # OLL needs to have 'course-v1:' added to the run_id + run_id = ( + f"course-v1:{run.run_id}" + if run.learning_resource.etl_source == ETLSource.oll.value + else run.run_id + ) if module_id.startswith("asset"): - log.info("Getting URL for asset %s", module_id) + log.debug("Getting URL for asset %s", module_id) asset_meta = ( assets_metadata.get(Path(olx_path).parts[-1], {}) if assets_metadata else {} ) video_meta = video_srt_metadata.get(module_id, {}) if video_srt_metadata else {} if video_meta: - log.info("Found video metadata for %s", module_id) - return f"{root_url}/xblock/{video_meta}" + log.debug("Found video metadata for %s", module_id) + return f"{root_url}/courses/{run_id}/jump_to/{video_meta.split('@')[-1]}" elif module_id.endswith(".srt"): - log.info("NO VIDEO METADATA FOR %s", module_id) + log.debug("No video metadata for %s", module_id) middle_path = asset_meta.get("custom_md5", "") return f"{root_url}/{(middle_path + '/') if middle_path else ''}{module_id}" elif module_id.startswith("block") and is_valid_uuid(module_id.split("@")[-1]): - return f"{root_url}/xblock/{module_id}" + return f"{root_url}/courses/{run_id}/jump_to_id/{module_id.split('@')[-1]}" else: log.warning("Unknown module ID format: %s", module_id) return None @@ -505,7 +509,7 @@ def parse_video_transcripts_xml( """ transcript_mapping = {} try: - root = parse(xml_content) + root = ElementTree.fromstring(xml_content) # Get the video url_name from the root video element video_url_name = root.get("url_name") @@ -520,7 +524,7 @@ def parse_video_transcripts_xml( transcript_mapping[ get_edx_module_id(f"static/{transcript_src}", run) ] = get_edx_module_id(str(path), run) - except ParseError: + except ElementTree.ParseError: log.exception("Error parsing video XML for %s: %s", run, path) return transcript_mapping @@ -537,7 +541,7 @@ def get_video_metadata(olx_path: str, run: LearningResourceRun) -> dict: for root, _, files in os.walk(str(Path(olx_path, "video"))): path = "/".join(root.split("/")[3:]) for filename in files: - log.info("Processing video file %s in %s", filename, path) + log.debug("Processing video file %s in %s", filename, path) extension_lower = Path(filename).suffix.lower() if extension_lower == ".xml": with Path.open(Path(root, filename), "rb") as f: diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py index a210bf4bd8..9c8cb0a716 100644 --- a/learning_resources/etl/utils_test.py +++ b/learning_resources/etl/utils_test.py @@ -224,6 +224,21 @@ def test_transform_content_files( # noqa: PLR0913 "learning_resources.etl.utils.extract_text_metadata", return_value=tika_output ) + # Mock the new functions called by _process_olx_path + assets_metadata = {"test": "assets"} + video_metadata = {"test": "video"} + test_url = "https://example.com/test" + + mocker.patch( + "learning_resources.etl.utils.get_assets_metadata", return_value=assets_metadata + ) + mocker.patch( + "learning_resources.etl.utils.get_video_metadata", return_value=video_metadata + ) + mocker.patch( + "learning_resources.etl.utils.get_url_from_module_id", return_value=test_url + ) + script_dir = (pathlib.Path(__file__).parent.absolute()).parent.parent content = list( @@ -248,6 +263,7 @@ def test_transform_content_files( # noqa: PLR0913 "file_extension": file_extension, "source_path": f"root/{folder}/uuid{file_extension}", "edx_module_id": edx_module_id, + "url": test_url, } ] else: @@ -558,3 +574,286 @@ def test_parse_resource_commitment(raw_value, min_hours, max_hours): assert utils.parse_resource_commitment(raw_value) == CommitmentConfig( commitment=raw_value, min_weekly_hours=min_hours, max_weekly_hours=max_hours ) + + +@pytest.mark.parametrize( + ("uuid_string", "expected"), + [ + ("550e8400-e29b-41d4-a716-446655440000", True), + ("123e4567e89b12d3a456426614174000", True), + ("not-a-uuid", False), + ("", False), + ("123", False), + ("550e8400-e29b-41d4-a716", False), # too short + ("550e8400e29b41d4a71644665544000g", False), # invalid character + ], +) +def test_is_valid_uuid(uuid_string, expected): + """Test that is_valid_uuid correctly validates UUID strings""" + assert utils.is_valid_uuid(uuid_string) == expected + + +@pytest.mark.parametrize("assets_exist", [True, False]) +def test_get_assets_metadata(mocker, tmp_path, assets_exist): + """Test that get_assets_metadata returns correct metadata when assets.json exists""" + olx_path = tmp_path / "course" + policies_dir = olx_path / "policies" + policies_dir.mkdir(parents=True) + + if assets_exist: + assets_data = { + "course-v1:MITx+6.00x+2T2017": { + "static/image.png": {"custom_md5": "abc123"}, + "static/video.mp4": {"custom_md5": "def456"}, + } + } + assets_file = policies_dir / "assets.json" + assets_file.write_text(str(assets_data).replace("'", '"')) + + result = utils.get_assets_metadata(str(olx_path)) + assert result == assets_data + else: + # No assets.json file exists + mock_log = mocker.patch("learning_resources.etl.utils.log") + result = utils.get_assets_metadata(str(olx_path)) + assert result is None + mock_log.warning.assert_called_once() + + +@pytest.mark.parametrize( + ("xml_content", "file_name", "expected_mapping"), + [ + ( + """""", + "test_video.xml", + { + "asset-v1:test_run+type@asset+block@test_transcript.srt": "block-v1:test_run+type@video+block@test_video", + "asset-v1:test_run+type@asset+block@test_transcript_es.srt": "block-v1:test_run+type@video+block@test_video", + }, + ), + ( + """""", + "another_video.xml", + { + "asset-v1:test_run+type@asset+block@another_transcript.srt": "block-v1:test_run+type@video+block@another_video" + }, + ), + ( + """""", + "no_url_name.xml", + {}, # No url_name, should return empty dict + ), + ( + """""", + "no_transcripts.xml", + {}, # No transcripts, should return empty dict + ), + ( + """invalid xml content""", + "invalid.xml", + {}, # Invalid XML, should return empty dict + ), + ], +) +def test_parse_video_transcripts_xml(mocker, xml_content, file_name, expected_mapping): + """Test that parse_video_transcripts_xml correctly parses video XML and creates transcript mapping""" + run = LearningResourceRunFactory.create(run_id="course-v1:test_run") + path = mocker.Mock() + path.__str__ = mocker.Mock(return_value=f"video/{file_name}") + + mock_log = mocker.patch("learning_resources.etl.utils.log") + + result = utils.parse_video_transcripts_xml(run, xml_content, path) + + assert result == expected_mapping + + # Check if warning/exception was logged for invalid XML + if "invalid xml" in xml_content.lower(): + mock_log.exception.assert_called_once() + + +@pytest.mark.parametrize("video_dir_exists", [True, False]) +def test_get_video_metadata(mocker, tmp_path, video_dir_exists): + """Test that get_video_metadata correctly processes video directory and returns transcript mappings""" + run = LearningResourceRunFactory.create(run_id="course-v1:test_run") + olx_path = tmp_path / "course" + olx_path.mkdir() + + mock_log = mocker.patch("learning_resources.etl.utils.log") + + if video_dir_exists: + video_dir = olx_path / "video" + video_dir.mkdir() + + # Create a test video XML file + video_xml = """""" + + video_file = video_dir / "test_video.xml" + video_file.write_text(video_xml) + + # Mock parse_video_transcripts_xml to return expected mapping + expected_mapping = { + "asset-v1:test_run+type@asset+block@test_transcript1.srt": "block-v1:test_run+type@video+block@test_video", + "asset-v1:test_run+type@asset+block@test_transcript2.srt": "block-v1:test_run+type@video+block@test_video", + } + mock_parse = mocker.patch( + "learning_resources.etl.utils.parse_video_transcripts_xml", + return_value=expected_mapping, + ) + + result = utils.get_video_metadata(str(olx_path), run) + + assert result == expected_mapping + # The function passes a file object to parse_video_transcripts_xml, not a Path + assert mock_parse.call_count == 1 + call_args = mock_parse.call_args[0] + assert call_args[0] == run + assert call_args[1] == video_xml + # The third argument is a file object, so we can't easily check its exact value + mock_log.debug.assert_called() + else: + # No video directory + result = utils.get_video_metadata(str(olx_path), run) + assert result == {} + mock_log.warning.assert_called_once() + + +@pytest.mark.parametrize( + ( + "etl_source", + "module_id", + "has_video_meta", + "has_asset_meta", + "expected_url_pattern", + ), + [ + # Asset URLs + ( + "mit_edx", + "asset-v1:test+type@asset+block@image.png", + False, + True, + "https://edx.org/asset-v1:test+type@asset+block@image.png", + ), # No custom_md5 in path + ( + "mit_edx", + "asset-v1:test+type@asset+block@video.mp4", + False, + False, + "https://edx.org/asset-v1:test+type@asset+block@video.mp4", + ), + ( + "mit_edx", + "asset-v1:test+type@asset+block@transcript.srt", + True, + False, + "https://edx.org/courses/course-v1:test_run/jump_to/test_video", + ), + ( + "mit_edx", + "asset-v1:test+type@asset+block@transcript.srt", + False, + False, + "https://edx.org/asset-v1:test+type@asset+block@transcript.srt", + ), # SRT without video meta returns asset URL + # Block URLs with valid UUID + ( + "mit_edx", + "block-v1:test+type@html+block@550e8400-e29b-41d4-a716-446655440000", + False, + False, + "https://edx.org/courses/course-v1:test_run/jump_to_id/550e8400-e29b-41d4-a716-446655440000", + ), + # OLL source with run_id modification + ( + "oll", + "block-v1:test+type@html+block@550e8400-e29b-41d4-a716-446655440000", + False, + False, + "https://oll.org/courses/course-v1:course-v1:test_run/jump_to_id/550e8400-e29b-41d4-a716-446655440000", + ), + # Invalid cases + ( + "", + "asset-v1:test+type@asset+block@file.txt", + False, + False, + "None/asset-v1:test+type@asset+block@file.txt", + ), # Empty etl_source returns None as root_url + ( + "mit_edx", + "block-v1:test+type@html+block@invalid-uuid", + False, + False, + None, + ), # Invalid UUID + ("mit_edx", "unknown-format", False, False, None), # Unknown format + ], +) +def test_get_url_from_module_id( # noqa: PLR0913 + mocker, + settings, + etl_source, + module_id, + has_video_meta, + has_asset_meta, + expected_url_pattern, +): + """Test that get_url_from_module_id generates correct URLs for different module types""" + # Setup settings + settings.CONTENT_BASE_URL_EDX = "https://edx.org" + settings.CONTENT_BASE_URL_OLL = "https://oll.org" + + run = LearningResourceRunFactory.create( + run_id="course-v1:test_run", learning_resource__etl_source=etl_source + ) + + olx_path = "/path/to/course-v1:test_run" + + # Setup metadata + assets_metadata = ( + { + "course-v1:test_run": { + "static/image.png": {} # No custom_md5 for simplicity + } + } + if has_asset_meta + else None + ) + + video_srt_metadata = ( + { + "asset-v1:test+type@asset+block@transcript.srt": "block-v1:test+type@video+block@test_video" + } + if has_video_meta + else None + ) + + # Mock log for warning cases + mock_log = mocker.patch("learning_resources.etl.utils.log") + + result = utils.get_url_from_module_id( + olx_path, module_id, run, assets_metadata, video_srt_metadata + ) + + if expected_url_pattern: + assert result == expected_url_pattern + else: + assert result is None + if not module_id: + mock_log.warning.assert_any_call("Module ID is empty") + elif module_id.endswith(".srt") and not has_video_meta: + mock_log.debug.assert_any_call("No video metadata for %s", module_id) + elif "unknown-format" in module_id or "invalid-uuid" in module_id: + mock_log.warning.assert_any_call("Unknown module ID format: %s", module_id) diff --git a/learning_resources_search/indexing_api.py b/learning_resources_search/indexing_api.py index e9a0d14b5f..c5baf10019 100644 --- a/learning_resources_search/indexing_api.py +++ b/learning_resources_search/indexing_api.py @@ -398,7 +398,6 @@ def index_run_content_files(run_id, index_types): index_types (string): one of the values IndexestoUpdate. Whether the default index, the reindexing index or both need to be updated """ - return run = LearningResourceRun.objects.get(pk=run_id) content_file_ids = run.content_files.filter(published=True).values_list( "id", flat=True diff --git a/main/settings_course_etl.py b/main/settings_course_etl.py index a1bef5b17f..4b8ec0c6fd 100644 --- a/main/settings_course_etl.py +++ b/main/settings_course_etl.py @@ -105,3 +105,16 @@ TIKA_TIMEOUT = get_int("TIKA_TIMEOUT", 60) TIKA_OCR_STRATEGY = get_string("TIKA_OCR_STRATEGY", "no_ocr") SKIP_TIKA = get_bool("SKIP_TIKA", default=False) + + +# Base content URLs for different sources +CONTENT_BASE_URL_MITXONLINE = get_string( + "CONTENT_BASE_URL_MITXONLINE", "https://courses.mitxonline.mit.edu" +) +CONTENT_BASE_URL_XPRO = get_string( + "CONTENT_BASE_URL_XPRO", "https://courses.xpro.mit.edu" +) +CONTENT_BASE_URL_OLL = get_string( + "CONTENT_BASE_URL_OLL", "https://openlearninglibrary.mit.edu" +) +CONTENT_BASE_URL_EDX = get_string("CONTENT_BASE_URL_EDX", "https://courses.edx.org") diff --git a/vector_search/tasks.py b/vector_search/tasks.py index 5ecc4c66aa..461a6692a1 100644 --- a/vector_search/tasks.py +++ b/vector_search/tasks.py @@ -52,7 +52,6 @@ def generate_embeddings(ids, resource_type, overwrite): resource_type (string): resource_type value for the learning resource objects """ - return None try: with wrap_retry_exception(*SEARCH_CONN_EXCEPTIONS): embed_learning_resources(ids, resource_type, overwrite) From 7d71966371920e2acf127e71c18d7406d9b7e000 Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Fri, 8 Aug 2025 07:48:44 -0400 Subject: [PATCH 3/5] Get rid of some debug logging --- learning_resources/etl/utils.py | 9 ++------- learning_resources/etl/utils_test.py | 4 ---- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py index f8193431ce..544a3be51a 100644 --- a/learning_resources/etl/utils.py +++ b/learning_resources/etl/utils.py @@ -467,16 +467,13 @@ def get_url_from_module_id( else run.run_id ) if module_id.startswith("asset"): - log.debug("Getting URL for asset %s", module_id) asset_meta = ( assets_metadata.get(Path(olx_path).parts[-1], {}) if assets_metadata else {} ) video_meta = video_srt_metadata.get(module_id, {}) if video_srt_metadata else {} if video_meta: - log.debug("Found video metadata for %s", module_id) + # Link to the parent video return f"{root_url}/courses/{run_id}/jump_to/{video_meta.split('@')[-1]}" - elif module_id.endswith(".srt"): - log.debug("No video metadata for %s", module_id) middle_path = asset_meta.get("custom_md5", "") return f"{root_url}/{(middle_path + '/') if middle_path else ''}{module_id}" elif module_id.startswith("block") and is_valid_uuid(module_id.split("@")[-1]): @@ -531,7 +528,7 @@ def parse_video_transcripts_xml( def get_video_metadata(olx_path: str, run: LearningResourceRun) -> dict: """ - Get metadata for video SRT files in an OLX path + Get metadata for video SRT/VTT files in an OLX path """ video_transcript_mapping = {} video_path = Path(olx_path, "video") @@ -539,9 +536,7 @@ def get_video_metadata(olx_path: str, run: LearningResourceRun) -> dict: log.warning("No video directory found in OLX path: %s", olx_path) return video_transcript_mapping for root, _, files in os.walk(str(Path(olx_path, "video"))): - path = "/".join(root.split("/")[3:]) for filename in files: - log.debug("Processing video file %s in %s", filename, path) extension_lower = Path(filename).suffix.lower() if extension_lower == ".xml": with Path.open(Path(root, filename), "rb") as f: diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py index 9c8cb0a716..89b91d803a 100644 --- a/learning_resources/etl/utils_test.py +++ b/learning_resources/etl/utils_test.py @@ -711,17 +711,13 @@ def test_get_video_metadata(mocker, tmp_path, video_dir_exists): "learning_resources.etl.utils.parse_video_transcripts_xml", return_value=expected_mapping, ) - result = utils.get_video_metadata(str(olx_path), run) assert result == expected_mapping - # The function passes a file object to parse_video_transcripts_xml, not a Path assert mock_parse.call_count == 1 call_args = mock_parse.call_args[0] assert call_args[0] == run assert call_args[1] == video_xml - # The third argument is a file object, so we can't easily check its exact value - mock_log.debug.assert_called() else: # No video directory result = utils.get_video_metadata(str(olx_path), run) From e31026768644489a23749c6009668189e45c9d0c Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Fri, 8 Aug 2025 09:28:09 -0400 Subject: [PATCH 4/5] Remove some noisy logging --- learning_resources/etl/utils.py | 6 ++---- learning_resources/etl/utils_test.py | 21 ++------------------- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py index 544a3be51a..24f63bf049 100644 --- a/learning_resources/etl/utils.py +++ b/learning_resources/etl/utils.py @@ -457,7 +457,6 @@ def get_url_from_module_id( str: The URL for the module """ if not module_id: - log.warning("Module ID is empty") return None root_url = get_root_url_for_source(run.learning_resource.etl_source) # OLL needs to have 'course-v1:' added to the run_id @@ -479,7 +478,6 @@ def get_url_from_module_id( elif module_id.startswith("block") and is_valid_uuid(module_id.split("@")[-1]): return f"{root_url}/courses/{run_id}/jump_to_id/{module_id.split('@')[-1]}" else: - log.warning("Unknown module ID format: %s", module_id) return None @@ -494,7 +492,7 @@ def get_assets_metadata(olx_path: str) -> dict: with Path.open(Path(olx_path, "policies/assets.json"), "rb") as f: return json.loads(f.read()) except FileNotFoundError: - log.warning("Assets metadata file does not exist: %s", olx_path) + log.debug("Assets metadata file does not exist: %s", olx_path) def parse_video_transcripts_xml( @@ -533,7 +531,7 @@ def get_video_metadata(olx_path: str, run: LearningResourceRun) -> dict: video_transcript_mapping = {} video_path = Path(olx_path, "video") if not video_path.exists(): - log.warning("No video directory found in OLX path: %s", olx_path) + log.debug("No video directory found in OLX path: %s", olx_path) return video_transcript_mapping for root, _, files in os.walk(str(Path(olx_path, "video"))): for filename in files: diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py index 89b91d803a..5ce3d79f1e 100644 --- a/learning_resources/etl/utils_test.py +++ b/learning_resources/etl/utils_test.py @@ -614,10 +614,7 @@ def test_get_assets_metadata(mocker, tmp_path, assets_exist): assert result == assets_data else: # No assets.json file exists - mock_log = mocker.patch("learning_resources.etl.utils.log") - result = utils.get_assets_metadata(str(olx_path)) - assert result is None - mock_log.warning.assert_called_once() + assert utils.get_assets_metadata(str(olx_path)) is None @pytest.mark.parametrize( @@ -687,8 +684,6 @@ def test_get_video_metadata(mocker, tmp_path, video_dir_exists): olx_path = tmp_path / "course" olx_path.mkdir() - mock_log = mocker.patch("learning_resources.etl.utils.log") - if video_dir_exists: video_dir = olx_path / "video" video_dir.mkdir() @@ -720,9 +715,7 @@ def test_get_video_metadata(mocker, tmp_path, video_dir_exists): assert call_args[1] == video_xml else: # No video directory - result = utils.get_video_metadata(str(olx_path), run) - assert result == {} - mock_log.warning.assert_called_once() + assert utils.get_video_metadata(str(olx_path), run) == {} @pytest.mark.parametrize( @@ -798,7 +791,6 @@ def test_get_video_metadata(mocker, tmp_path, video_dir_exists): ], ) def test_get_url_from_module_id( # noqa: PLR0913 - mocker, settings, etl_source, module_id, @@ -836,9 +828,6 @@ def test_get_url_from_module_id( # noqa: PLR0913 else None ) - # Mock log for warning cases - mock_log = mocker.patch("learning_resources.etl.utils.log") - result = utils.get_url_from_module_id( olx_path, module_id, run, assets_metadata, video_srt_metadata ) @@ -847,9 +836,3 @@ def test_get_url_from_module_id( # noqa: PLR0913 assert result == expected_url_pattern else: assert result is None - if not module_id: - mock_log.warning.assert_any_call("Module ID is empty") - elif module_id.endswith(".srt") and not has_video_meta: - mock_log.debug.assert_any_call("No video metadata for %s", module_id) - elif "unknown-format" in module_id or "invalid-uuid" in module_id: - mock_log.warning.assert_any_call("Unknown module ID format: %s", module_id) From f05f8c737181be865e2b3aa4994492d504f962fa Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Fri, 8 Aug 2025 14:14:01 -0400 Subject: [PATCH 5/5] assets md5/middle path not necessary --- learning_resources/etl/utils.py | 27 +------------ learning_resources/etl/utils_test.py | 60 ++-------------------------- 2 files changed, 6 insertions(+), 81 deletions(-) diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py index 24f63bf049..06a31682d3 100644 --- a/learning_resources/etl/utils.py +++ b/learning_resources/etl/utils.py @@ -440,10 +440,8 @@ def is_valid_uuid(uuid_string: str) -> bool: def get_url_from_module_id( - olx_path: str, module_id: str, run: LearningResourceRun, - assets_metadata: Optional[dict] = None, video_srt_metadata: Optional[dict] = None, ) -> str: """ @@ -466,35 +464,17 @@ def get_url_from_module_id( else run.run_id ) if module_id.startswith("asset"): - asset_meta = ( - assets_metadata.get(Path(olx_path).parts[-1], {}) if assets_metadata else {} - ) video_meta = video_srt_metadata.get(module_id, {}) if video_srt_metadata else {} if video_meta: # Link to the parent video return f"{root_url}/courses/{run_id}/jump_to/{video_meta.split('@')[-1]}" - middle_path = asset_meta.get("custom_md5", "") - return f"{root_url}/{(middle_path + '/') if middle_path else ''}{module_id}" + return f"{root_url}/{module_id}" elif module_id.startswith("block") and is_valid_uuid(module_id.split("@")[-1]): return f"{root_url}/courses/{run_id}/jump_to_id/{module_id.split('@')[-1]}" else: return None -def get_assets_metadata(olx_path: str) -> dict: - """ - Get metadata for assets in an OLX path - - Args: - olx_path (str): The path to the OLX directory - """ - try: - with Path.open(Path(olx_path, "policies/assets.json"), "rb") as f: - return json.loads(f.read()) - except FileNotFoundError: - log.debug("Assets metadata file does not exist: %s", olx_path) - - def parse_video_transcripts_xml( run: LearningResourceRun, xml_content: str, path: Path ) -> dict: @@ -548,7 +528,6 @@ def get_video_metadata(olx_path: str, run: LearningResourceRun) -> dict: def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite): - assets_metadata = get_assets_metadata(olx_path) video_srt_metadata = get_video_metadata(olx_path, run) for document, metadata in documents_from_olx(olx_path): source_path = metadata.get("source_path") @@ -607,9 +586,7 @@ def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite): "file_extension": file_extension, "source_path": source_path, "edx_module_id": edx_module_id, - "url": get_url_from_module_id( - source_path, edx_module_id, run, assets_metadata, video_srt_metadata - ), + "url": get_url_from_module_id(edx_module_id, run, video_srt_metadata), **content_dict, } ) diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py index 5ce3d79f1e..1ad8631436 100644 --- a/learning_resources/etl/utils_test.py +++ b/learning_resources/etl/utils_test.py @@ -225,13 +225,9 @@ def test_transform_content_files( # noqa: PLR0913 ) # Mock the new functions called by _process_olx_path - assets_metadata = {"test": "assets"} video_metadata = {"test": "video"} test_url = "https://example.com/test" - mocker.patch( - "learning_resources.etl.utils.get_assets_metadata", return_value=assets_metadata - ) mocker.patch( "learning_resources.etl.utils.get_video_metadata", return_value=video_metadata ) @@ -593,30 +589,6 @@ def test_is_valid_uuid(uuid_string, expected): assert utils.is_valid_uuid(uuid_string) == expected -@pytest.mark.parametrize("assets_exist", [True, False]) -def test_get_assets_metadata(mocker, tmp_path, assets_exist): - """Test that get_assets_metadata returns correct metadata when assets.json exists""" - olx_path = tmp_path / "course" - policies_dir = olx_path / "policies" - policies_dir.mkdir(parents=True) - - if assets_exist: - assets_data = { - "course-v1:MITx+6.00x+2T2017": { - "static/image.png": {"custom_md5": "abc123"}, - "static/video.mp4": {"custom_md5": "def456"}, - } - } - assets_file = policies_dir / "assets.json" - assets_file.write_text(str(assets_data).replace("'", '"')) - - result = utils.get_assets_metadata(str(olx_path)) - assert result == assets_data - else: - # No assets.json file exists - assert utils.get_assets_metadata(str(olx_path)) is None - - @pytest.mark.parametrize( ("xml_content", "file_name", "expected_mapping"), [ @@ -723,7 +695,6 @@ def test_get_video_metadata(mocker, tmp_path, video_dir_exists): "etl_source", "module_id", "has_video_meta", - "has_asset_meta", "expected_url_pattern", ), [ @@ -732,28 +703,24 @@ def test_get_video_metadata(mocker, tmp_path, video_dir_exists): "mit_edx", "asset-v1:test+type@asset+block@image.png", False, - True, "https://edx.org/asset-v1:test+type@asset+block@image.png", - ), # No custom_md5 in path + ), ( "mit_edx", "asset-v1:test+type@asset+block@video.mp4", False, - False, "https://edx.org/asset-v1:test+type@asset+block@video.mp4", ), ( "mit_edx", "asset-v1:test+type@asset+block@transcript.srt", True, - False, "https://edx.org/courses/course-v1:test_run/jump_to/test_video", ), ( "mit_edx", "asset-v1:test+type@asset+block@transcript.srt", False, - False, "https://edx.org/asset-v1:test+type@asset+block@transcript.srt", ), # SRT without video meta returns asset URL # Block URLs with valid UUID @@ -761,7 +728,6 @@ def test_get_video_metadata(mocker, tmp_path, video_dir_exists): "mit_edx", "block-v1:test+type@html+block@550e8400-e29b-41d4-a716-446655440000", False, - False, "https://edx.org/courses/course-v1:test_run/jump_to_id/550e8400-e29b-41d4-a716-446655440000", ), # OLL source with run_id modification @@ -769,7 +735,6 @@ def test_get_video_metadata(mocker, tmp_path, video_dir_exists): "oll", "block-v1:test+type@html+block@550e8400-e29b-41d4-a716-446655440000", False, - False, "https://oll.org/courses/course-v1:course-v1:test_run/jump_to_id/550e8400-e29b-41d4-a716-446655440000", ), # Invalid cases @@ -777,25 +742,22 @@ def test_get_video_metadata(mocker, tmp_path, video_dir_exists): "", "asset-v1:test+type@asset+block@file.txt", False, - False, "None/asset-v1:test+type@asset+block@file.txt", ), # Empty etl_source returns None as root_url ( "mit_edx", "block-v1:test+type@html+block@invalid-uuid", False, - False, None, ), # Invalid UUID - ("mit_edx", "unknown-format", False, False, None), # Unknown format + ("mit_edx", "unknown-format", False, None), # Unknown format ], ) -def test_get_url_from_module_id( # noqa: PLR0913 +def test_get_url_from_module_id( settings, etl_source, module_id, has_video_meta, - has_asset_meta, expected_url_pattern, ): """Test that get_url_from_module_id generates correct URLs for different module types""" @@ -807,19 +769,7 @@ def test_get_url_from_module_id( # noqa: PLR0913 run_id="course-v1:test_run", learning_resource__etl_source=etl_source ) - olx_path = "/path/to/course-v1:test_run" - # Setup metadata - assets_metadata = ( - { - "course-v1:test_run": { - "static/image.png": {} # No custom_md5 for simplicity - } - } - if has_asset_meta - else None - ) - video_srt_metadata = ( { "asset-v1:test+type@asset+block@transcript.srt": "block-v1:test+type@video+block@test_video" @@ -828,9 +778,7 @@ def test_get_url_from_module_id( # noqa: PLR0913 else None ) - result = utils.get_url_from_module_id( - olx_path, module_id, run, assets_metadata, video_srt_metadata - ) + result = utils.get_url_from_module_id(module_id, run, video_srt_metadata) if expected_url_pattern: assert result == expected_url_pattern