diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py index 2e9058e261..06a31682d3 100644 --- a/learning_resources/etl/utils.py +++ b/learning_resources/etl/utils.py @@ -18,10 +18,12 @@ from pathlib import Path from subprocess import check_call from tempfile import TemporaryDirectory +from typing import Optional import boto3 import rapidjson import requests +from defusedxml import ElementTree from django.conf import settings from django.utils.dateparse import parse_duration from django.utils.text import slugify @@ -341,7 +343,7 @@ def documents_from_olx( "mime_type": mimetype, "archive_checksum": archive_checksum, "file_extension": extension_lower, - "source_path": f"{path}/{filename}", + "source_path": f"{path}/{filename.replace(' ', '_')}", }, ) @@ -407,7 +409,126 @@ def text_from_sjson_content(content: str): return " ".join(data.get("text", [])) +def get_root_url_for_source(etl_source: str) -> tuple[str, str]: + """ + Get the base URL and path for an ETL source + + Args: + etl_source (str): The ETL source path + + Returns: + tuple[str, str]: The base URL and path + """ + mapping = { + ETLSource.mitxonline.value: settings.CONTENT_BASE_URL_MITXONLINE, + ETLSource.xpro.value: settings.CONTENT_BASE_URL_XPRO, + ETLSource.oll.value: settings.CONTENT_BASE_URL_OLL, + ETLSource.mit_edx.value: settings.CONTENT_BASE_URL_EDX, + } + return mapping.get(etl_source) + + +def is_valid_uuid(uuid_string: str) -> bool: + """ + Check if a string is a valid UUID + """ + try: + uuid.UUID(uuid_string) + except ValueError: + return False + return True + + +def get_url_from_module_id( + module_id: str, + run: LearningResourceRun, + video_srt_metadata: Optional[dict] = None, +) -> str: + """ + Get the URL for a module based on its ID + + Args: + module_id (str): The module ID + run (LearningResourceRun): The run associated with the module + + Returns: + str: The URL for the module + """ + if not module_id: + return None + root_url = 
get_root_url_for_source(run.learning_resource.etl_source) + # OLL needs to have 'course-v1:' added to the run_id + run_id = ( + f"course-v1:{run.run_id}" + if run.learning_resource.etl_source == ETLSource.oll.value + else run.run_id + ) + if module_id.startswith("asset"): + video_meta = video_srt_metadata.get(module_id, {}) if video_srt_metadata else {} + if video_meta: + # Link to the parent video + return f"{root_url}/courses/{run_id}/jump_to/{video_meta.split('@')[-1]}" + return f"{root_url}/{module_id}" + elif module_id.startswith("block") and is_valid_uuid(module_id.split("@")[-1]): + return f"{root_url}/courses/{run_id}/jump_to_id/{module_id.split('@')[-1]}" + else: + return None + + +def parse_video_transcripts_xml( + run: LearningResourceRun, xml_content: str, path: Path +) -> dict: + """ + Parse video XML content and create a mapping of + transcript edx_module_id to video edx_module_id + """ + transcript_mapping = {} + try: + root = ElementTree.fromstring(xml_content) + + # Get the video url_name from the root video element + video_url_name = root.get("url_name") + if not video_url_name: + log.warning("No url_name found in video XML") + return {} + + # Find all transcript elements and extract their src attributes + for transcript in root.findall(".//transcript"): + transcript_src = transcript.get("src") + if transcript_src: + transcript_mapping[ + get_edx_module_id(f"static/{transcript_src}", run) + ] = get_edx_module_id(str(path), run) + except ElementTree.ParseError: + log.exception("Error parsing video XML for %s: %s", run, path) + return transcript_mapping + + +def get_video_metadata(olx_path: str, run: LearningResourceRun) -> dict: + """ + Get metadata for video SRT/VTT files in an OLX path + """ + video_transcript_mapping = {} + video_path = Path(olx_path, "video") + if not video_path.exists(): + log.debug("No video directory found in OLX path: %s", olx_path) + return video_transcript_mapping + for root, _, files in os.walk(str(Path(olx_path, 
"video"))): + for filename in files: + extension_lower = Path(filename).suffix.lower() + if extension_lower == ".xml": + with Path.open(Path(root, filename), "rb") as f: + video_xml = f.read().decode("utf-8") + + # Parse the XML and get transcript mappings + transcript_mapping = parse_video_transcripts_xml(run, video_xml, f) + video_transcript_mapping.update(transcript_mapping) + + return video_transcript_mapping + + def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite): + video_srt_metadata = get_video_metadata(olx_path, run) for document, metadata in documents_from_olx(olx_path): source_path = metadata.get("source_path") edx_module_id = get_edx_module_id(source_path, run) @@ -465,6 +586,7 @@ def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite): "file_extension": file_extension, "source_path": source_path, "edx_module_id": edx_module_id, + "url": get_url_from_module_id(edx_module_id, run, video_srt_metadata), **content_dict, } ) @@ -741,7 +863,7 @@ def parse_certification(offeror, runs_data): ) -def iso8601_duration(duration_str: str) -> str or None: +def iso8601_duration(duration_str: str) -> str | None: """ Parse the duration from a string and return it in ISO-8601 format @@ -821,7 +943,7 @@ def calculate_weeks(num: int, from_unit: str) -> int: return num -def transform_interval(interval_txt: str) -> str or None: +def transform_interval(interval_txt: str) -> str | None: """ Transform any interval units to standard English units Only languages currently supported are English and Spanish diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py index a210bf4bd8..1ad8631436 100644 --- a/learning_resources/etl/utils_test.py +++ b/learning_resources/etl/utils_test.py @@ -224,6 +224,17 @@ def test_transform_content_files( # noqa: PLR0913 "learning_resources.etl.utils.extract_text_metadata", return_value=tika_output ) + # Mock the new functions called by _process_olx_path + video_metadata = 
@pytest.mark.parametrize(
    ("uuid_string", "expected"),
    [
        ("550e8400-e29b-41d4-a716-446655440000", True),
        ("123e4567e89b12d3a456426614174000", True),
        ("not-a-uuid", False),
        ("", False),
        ("123", False),
        ("550e8400-e29b-41d4-a716", False),  # too short
        ("550e8400e29b41d4a71644665544000g", False),  # invalid character
    ],
)
def test_is_valid_uuid(uuid_string, expected):
    """Test that is_valid_uuid correctly validates UUID strings"""
    assert utils.is_valid_uuid(uuid_string) == expected


# NOTE(review): the XML fixtures below were empty strings in the reviewed
# text (their markup had been stripped). They are reconstructed from the
# expected mappings and from the attributes the parser reads (root
# "url_name", "transcript" elements with "src"); confirm against the
# intended fixtures.
@pytest.mark.parametrize(
    ("xml_content", "file_name", "expected_mapping"),
    [
        (
            """<video url_name="test_video">
                <transcript src="test_transcript.srt"/>
                <transcript src="test_transcript_es.srt"/>
            </video>""",
            "test_video.xml",
            {
                "asset-v1:test_run+type@asset+block@test_transcript.srt": "block-v1:test_run+type@video+block@test_video",
                "asset-v1:test_run+type@asset+block@test_transcript_es.srt": "block-v1:test_run+type@video+block@test_video",
            },
        ),
        (
            """<video url_name="another_video">
                <transcript src="another_transcript.srt"/>
            </video>""",
            "another_video.xml",
            {
                "asset-v1:test_run+type@asset+block@another_transcript.srt": "block-v1:test_run+type@video+block@another_video"
            },
        ),
        (
            """<video>
                <transcript src="test_transcript.srt"/>
            </video>""",
            "no_url_name.xml",
            {},  # No url_name, should return empty dict
        ),
        (
            """<video url_name="no_transcripts"></video>""",
            "no_transcripts.xml",
            {},  # No transcripts, should return empty dict
        ),
        (
            """invalid xml content""",
            "invalid.xml",
            {},  # Invalid XML, should return empty dict
        ),
    ],
)
def test_parse_video_transcripts_xml(mocker, xml_content, file_name, expected_mapping):
    """Test that parse_video_transcripts_xml correctly parses video XML and creates transcript mapping"""
    run = LearningResourceRunFactory.create(run_id="course-v1:test_run")
    path = mocker.Mock()
    path.__str__ = mocker.Mock(return_value=f"video/{file_name}")

    mock_log = mocker.patch("learning_resources.etl.utils.log")

    result = utils.parse_video_transcripts_xml(run, xml_content, path)

    assert result == expected_mapping

    # Check if warning/exception was logged for invalid XML
    if "invalid xml" in xml_content.lower():
        mock_log.exception.assert_called_once()


@pytest.mark.parametrize("video_dir_exists", [True, False])
def test_get_video_metadata(mocker, tmp_path, video_dir_exists):
    """Test that get_video_metadata correctly processes video directory and returns transcript mappings"""
    run = LearningResourceRunFactory.create(run_id="course-v1:test_run")
    olx_path = tmp_path / "course"
    olx_path.mkdir()

    if not video_dir_exists:
        # No video directory
        assert utils.get_video_metadata(str(olx_path), run) == {}
        return

    video_dir = olx_path / "video"
    video_dir.mkdir()

    # Create a test video XML file (content reconstructed; the parser is
    # mocked below, so only the round trip of the file content matters)
    video_xml = (
        '<video url_name="test_video">'
        '<transcript src="test_transcript1.srt"/>'
        '<transcript src="test_transcript2.srt"/>'
        "</video>"
    )

    video_file = video_dir / "test_video.xml"
    video_file.write_text(video_xml)

    # Mock parse_video_transcripts_xml to return expected mapping
    expected_mapping = {
        "asset-v1:test_run+type@asset+block@test_transcript1.srt": "block-v1:test_run+type@video+block@test_video",
        "asset-v1:test_run+type@asset+block@test_transcript2.srt": "block-v1:test_run+type@video+block@test_video",
    }
    mock_parse = mocker.patch(
        "learning_resources.etl.utils.parse_video_transcripts_xml",
        return_value=expected_mapping,
    )
    result = utils.get_video_metadata(str(olx_path), run)

    assert result == expected_mapping
    assert mock_parse.call_count == 1
    call_args = mock_parse.call_args[0]
    assert call_args[0] == run
    assert call_args[1] == video_xml


@pytest.mark.parametrize(
    (
        "etl_source",
        "module_id",
        "has_video_meta",
        "expected_url_pattern",
    ),
    [
        # Asset URLs
        (
            "mit_edx",
            "asset-v1:test+type@asset+block@image.png",
            False,
            "https://edx.org/asset-v1:test+type@asset+block@image.png",
        ),
        (
            "mit_edx",
            "asset-v1:test+type@asset+block@video.mp4",
            False,
            "https://edx.org/asset-v1:test+type@asset+block@video.mp4",
        ),
        (
            "mit_edx",
            "asset-v1:test+type@asset+block@transcript.srt",
            True,
            "https://edx.org/courses/course-v1:test_run/jump_to/test_video",
        ),
        (
            "mit_edx",
            "asset-v1:test+type@asset+block@transcript.srt",
            False,
            "https://edx.org/asset-v1:test+type@asset+block@transcript.srt",
        ),  # SRT without video meta returns asset URL
        # Block URLs with valid UUID
        (
            "mit_edx",
            "block-v1:test+type@html+block@550e8400-e29b-41d4-a716-446655440000",
            False,
            "https://edx.org/courses/course-v1:test_run/jump_to_id/550e8400-e29b-41d4-a716-446655440000",
        ),
        # OLL source with run_id modification
        (
            "oll",
            "block-v1:test+type@html+block@550e8400-e29b-41d4-a716-446655440000",
            False,
            "https://oll.org/courses/course-v1:course-v1:test_run/jump_to_id/550e8400-e29b-41d4-a716-446655440000",
        ),
        # Invalid cases
        (
            "",
            "asset-v1:test+type@asset+block@file.txt",
            False,
            "None/asset-v1:test+type@asset+block@file.txt",
        ),  # Empty etl_source returns None as root_url
        (
            "mit_edx",
            "block-v1:test+type@html+block@invalid-uuid",
            False,
            None,
        ),  # Invalid UUID
        ("mit_edx", "unknown-format", False, None),  # Unknown format
    ],
)
def test_get_url_from_module_id(
    settings,
    etl_source,
    module_id,
    has_video_meta,
    expected_url_pattern,
):
    """Test that get_url_from_module_id generates correct URLs for different module types"""
    # Setup settings
    settings.CONTENT_BASE_URL_EDX = "https://edx.org"
    settings.CONTENT_BASE_URL_OLL = "https://oll.org"

    run = LearningResourceRunFactory.create(
        run_id="course-v1:test_run", learning_resource__etl_source=etl_source
    )

    # Setup metadata
    video_srt_metadata = (
        {
            "asset-v1:test+type@asset+block@transcript.srt": "block-v1:test+type@video+block@test_video"
        }
        if has_video_meta
        else None
    )

    result = utils.get_url_from_module_id(module_id, run, video_srt_metadata)

    # Covers both the URL cases and the None cases
    assert result == expected_url_pattern
# Base content URLs, one setting per ETL source platform
CONTENT_BASE_URL_EDX = get_string("CONTENT_BASE_URL_EDX", "https://courses.edx.org")
CONTENT_BASE_URL_MITXONLINE = get_string(
    "CONTENT_BASE_URL_MITXONLINE", "https://courses.mitxonline.mit.edu"
)
CONTENT_BASE_URL_OLL = get_string(
    "CONTENT_BASE_URL_OLL", "https://openlearninglibrary.mit.edu"
)
CONTENT_BASE_URL_XPRO = get_string(
    "CONTENT_BASE_URL_XPRO", "https://courses.xpro.mit.edu"
)