diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py
index 2e9058e261..06a31682d3 100644
--- a/learning_resources/etl/utils.py
+++ b/learning_resources/etl/utils.py
@@ -18,10 +18,12 @@
from pathlib import Path
from subprocess import check_call
from tempfile import TemporaryDirectory
+from typing import Optional
import boto3
import rapidjson
import requests
+from defusedxml import ElementTree
from django.conf import settings
from django.utils.dateparse import parse_duration
from django.utils.text import slugify
@@ -341,7 +343,7 @@ def documents_from_olx(
"mime_type": mimetype,
"archive_checksum": archive_checksum,
"file_extension": extension_lower,
- "source_path": f"{path}/{filename}",
+ "source_path": f"{path}/{filename.replace(' ', '_')}",
},
)
@@ -407,7 +409,126 @@ def text_from_sjson_content(content: str):
return " ".join(data.get("text", []))
+def get_root_url_for_source(etl_source: str) -> tuple[str, str]:
+ """
+ Get the base URL and path for an ETL source
+
+ Args:
+ etl_source (str): The ETL source path
+
+ Returns:
+ tuple[str, str]: The base URL and path
+ """
+ mapping = {
+ ETLSource.mitxonline.value: settings.CONTENT_BASE_URL_MITXONLINE,
+ ETLSource.xpro.value: settings.CONTENT_BASE_URL_XPRO,
+ ETLSource.oll.value: settings.CONTENT_BASE_URL_OLL,
+ ETLSource.mit_edx.value: settings.CONTENT_BASE_URL_EDX,
+ }
+ return mapping.get(etl_source)
+
+
+def is_valid_uuid(uuid_string: str) -> bool:
+ """
+ Check if a string is a valid UUID
+ """
+ try:
+ uuid.UUID(uuid_string)
+ except ValueError:
+ return False
+ return True
+
+
+def get_url_from_module_id(
+ module_id: str,
+ run: LearningResourceRun,
+ video_srt_metadata: Optional[dict] = None,
+) -> str:
+ """
+ Get the URL for a module based on its ID
+
+ Args:
+ module_id (str): The module ID
+ run (LearningResourceRun): The run associated with the module
+
+ Returns:
+ str: The URL for the module
+ """
+ if not module_id:
+ return None
+ root_url = get_root_url_for_source(run.learning_resource.etl_source)
+ # OLL needs to have 'course-v1:' added to the run_id
+ run_id = (
+ f"course-v1:{run.run_id}"
+ if run.learning_resource.etl_source == ETLSource.oll.value
+ else run.run_id
+ )
+ if module_id.startswith("asset"):
+ video_meta = video_srt_metadata.get(module_id, {}) if video_srt_metadata else {}
+ if video_meta:
+ # Link to the parent video
+ return f"{root_url}/courses/{run_id}/jump_to/{video_meta.split('@')[-1]}"
+ return f"{root_url}/{module_id}"
+ elif module_id.startswith("block") and is_valid_uuid(module_id.split("@")[-1]):
+ return f"{root_url}/courses/{run_id}/jump_to_id/{module_id.split('@')[-1]}"
+ else:
+ return None
+
+
+def parse_video_transcripts_xml(
+ run: LearningResourceRun, xml_content: str, path: Path
+) -> dict:
+ """
+ Parse video XML content and create a mapping of
+ transcript edx_module_id to video edx_module_id
+ """
+ transcript_mapping = {}
+ try:
+ root = ElementTree.fromstring(xml_content)
+
+ # Get the video url_name from the root video element
+ video_url_name = root.get("url_name")
+ if not video_url_name:
+ log.warning("No url_name found in video XML")
+ return {}
+
+ # Find all transcript elements and extract their src attributes
+ for transcript in root.findall(".//transcript"):
+ transcript_src = transcript.get("src")
+ if transcript_src:
+ transcript_mapping[
+ get_edx_module_id(f"static/{transcript_src}", run)
+ ] = get_edx_module_id(str(path), run)
+ except ElementTree.ParseError:
+ log.exception("Error parsing video XML for %s: %s", run, path)
+ return transcript_mapping
+
+
+def get_video_metadata(olx_path: str, run: LearningResourceRun) -> dict:
+ """
+ Get metadata for video SRT/VTT files in an OLX path
+ """
+ video_transcript_mapping = {}
+ video_path = Path(olx_path, "video")
+ if not video_path.exists():
+ log.debug("No video directory found in OLX path: %s", olx_path)
+ return video_transcript_mapping
+ for root, _, files in os.walk(str(Path(olx_path, "video"))):
+ for filename in files:
+ extension_lower = Path(filename).suffix.lower()
+ if extension_lower == ".xml":
+ with Path.open(Path(root, filename), "rb") as f:
+ video_xml = f.read().decode("utf-8")
+
+ # Parse the XML and get transcript mappings
+ transcript_mapping = parse_video_transcripts_xml(run, video_xml, f)
+ video_transcript_mapping.update(transcript_mapping)
+
+ return video_transcript_mapping
+
+
def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite):
+ video_srt_metadata = get_video_metadata(olx_path, run)
for document, metadata in documents_from_olx(olx_path):
source_path = metadata.get("source_path")
edx_module_id = get_edx_module_id(source_path, run)
@@ -465,6 +586,7 @@ def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite):
"file_extension": file_extension,
"source_path": source_path,
"edx_module_id": edx_module_id,
+ "url": get_url_from_module_id(edx_module_id, run, video_srt_metadata),
**content_dict,
}
)
@@ -741,7 +863,7 @@ def parse_certification(offeror, runs_data):
)
-def iso8601_duration(duration_str: str) -> str or None:
+def iso8601_duration(duration_str: str) -> str | None:
"""
Parse the duration from a string and return it in ISO-8601 format
@@ -821,7 +943,7 @@ def calculate_weeks(num: int, from_unit: str) -> int:
return num
-def transform_interval(interval_txt: str) -> str or None:
+def transform_interval(interval_txt: str) -> str | None:
"""
Transform any interval units to standard English units
Only languages currently supported are English and Spanish
diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py
index a210bf4bd8..1ad8631436 100644
--- a/learning_resources/etl/utils_test.py
+++ b/learning_resources/etl/utils_test.py
@@ -224,6 +224,17 @@ def test_transform_content_files( # noqa: PLR0913
"learning_resources.etl.utils.extract_text_metadata", return_value=tika_output
)
+ # Mock the new functions called by _process_olx_path
+ video_metadata = {"test": "video"}
+ test_url = "https://example.com/test"
+
+ mocker.patch(
+ "learning_resources.etl.utils.get_video_metadata", return_value=video_metadata
+ )
+ mocker.patch(
+ "learning_resources.etl.utils.get_url_from_module_id", return_value=test_url
+ )
+
script_dir = (pathlib.Path(__file__).parent.absolute()).parent.parent
content = list(
@@ -248,6 +259,7 @@ def test_transform_content_files( # noqa: PLR0913
"file_extension": file_extension,
"source_path": f"root/{folder}/uuid{file_extension}",
"edx_module_id": edx_module_id,
+ "url": test_url,
}
]
else:
@@ -558,3 +570,217 @@ def test_parse_resource_commitment(raw_value, min_hours, max_hours):
assert utils.parse_resource_commitment(raw_value) == CommitmentConfig(
commitment=raw_value, min_weekly_hours=min_hours, max_weekly_hours=max_hours
)
+
+
+@pytest.mark.parametrize(
+ ("uuid_string", "expected"),
+ [
+ ("550e8400-e29b-41d4-a716-446655440000", True),
+ ("123e4567e89b12d3a456426614174000", True),
+ ("not-a-uuid", False),
+ ("", False),
+ ("123", False),
+ ("550e8400-e29b-41d4-a716", False), # too short
+ ("550e8400e29b41d4a71644665544000g", False), # invalid character
+ ],
+)
+def test_is_valid_uuid(uuid_string, expected):
+ """Test that is_valid_uuid correctly validates UUID strings"""
+ assert utils.is_valid_uuid(uuid_string) == expected
+
+
+@pytest.mark.parametrize(
+ ("xml_content", "file_name", "expected_mapping"),
+ [
+ (
+ """""",
+ "test_video.xml",
+ {
+ "asset-v1:test_run+type@asset+block@test_transcript.srt": "block-v1:test_run+type@video+block@test_video",
+ "asset-v1:test_run+type@asset+block@test_transcript_es.srt": "block-v1:test_run+type@video+block@test_video",
+ },
+ ),
+ (
+ """""",
+ "another_video.xml",
+ {
+ "asset-v1:test_run+type@asset+block@another_transcript.srt": "block-v1:test_run+type@video+block@another_video"
+ },
+ ),
+ (
+ """""",
+ "no_url_name.xml",
+ {}, # No url_name, should return empty dict
+ ),
+ (
+ """""",
+ "no_transcripts.xml",
+ {}, # No transcripts, should return empty dict
+ ),
+ (
+ """invalid xml content""",
+ "invalid.xml",
+ {}, # Invalid XML, should return empty dict
+ ),
+ ],
+)
+def test_parse_video_transcripts_xml(mocker, xml_content, file_name, expected_mapping):
+ """Test that parse_video_transcripts_xml correctly parses video XML and creates transcript mapping"""
+ run = LearningResourceRunFactory.create(run_id="course-v1:test_run")
+ path = mocker.Mock()
+ path.__str__ = mocker.Mock(return_value=f"video/{file_name}")
+
+ mock_log = mocker.patch("learning_resources.etl.utils.log")
+
+ result = utils.parse_video_transcripts_xml(run, xml_content, path)
+
+ assert result == expected_mapping
+
+ # Check if warning/exception was logged for invalid XML
+ if "invalid xml" in xml_content.lower():
+ mock_log.exception.assert_called_once()
+
+
+@pytest.mark.parametrize("video_dir_exists", [True, False])
+def test_get_video_metadata(mocker, tmp_path, video_dir_exists):
+ """Test that get_video_metadata correctly processes video directory and returns transcript mappings"""
+ run = LearningResourceRunFactory.create(run_id="course-v1:test_run")
+ olx_path = tmp_path / "course"
+ olx_path.mkdir()
+
+ if video_dir_exists:
+ video_dir = olx_path / "video"
+ video_dir.mkdir()
+
+ # Create a test video XML file
+ video_xml = """"""
+
+ video_file = video_dir / "test_video.xml"
+ video_file.write_text(video_xml)
+
+ # Mock parse_video_transcripts_xml to return expected mapping
+ expected_mapping = {
+ "asset-v1:test_run+type@asset+block@test_transcript1.srt": "block-v1:test_run+type@video+block@test_video",
+ "asset-v1:test_run+type@asset+block@test_transcript2.srt": "block-v1:test_run+type@video+block@test_video",
+ }
+ mock_parse = mocker.patch(
+ "learning_resources.etl.utils.parse_video_transcripts_xml",
+ return_value=expected_mapping,
+ )
+ result = utils.get_video_metadata(str(olx_path), run)
+
+ assert result == expected_mapping
+ assert mock_parse.call_count == 1
+ call_args = mock_parse.call_args[0]
+ assert call_args[0] == run
+ assert call_args[1] == video_xml
+ else:
+ # No video directory
+ assert utils.get_video_metadata(str(olx_path), run) == {}
+
+
+@pytest.mark.parametrize(
+ (
+ "etl_source",
+ "module_id",
+ "has_video_meta",
+ "expected_url_pattern",
+ ),
+ [
+ # Asset URLs
+ (
+ "mit_edx",
+ "asset-v1:test+type@asset+block@image.png",
+ False,
+ "https://edx.org/asset-v1:test+type@asset+block@image.png",
+ ),
+ (
+ "mit_edx",
+ "asset-v1:test+type@asset+block@video.mp4",
+ False,
+ "https://edx.org/asset-v1:test+type@asset+block@video.mp4",
+ ),
+ (
+ "mit_edx",
+ "asset-v1:test+type@asset+block@transcript.srt",
+ True,
+ "https://edx.org/courses/course-v1:test_run/jump_to/test_video",
+ ),
+ (
+ "mit_edx",
+ "asset-v1:test+type@asset+block@transcript.srt",
+ False,
+ "https://edx.org/asset-v1:test+type@asset+block@transcript.srt",
+ ), # SRT without video meta returns asset URL
+ # Block URLs with valid UUID
+ (
+ "mit_edx",
+ "block-v1:test+type@html+block@550e8400-e29b-41d4-a716-446655440000",
+ False,
+ "https://edx.org/courses/course-v1:test_run/jump_to_id/550e8400-e29b-41d4-a716-446655440000",
+ ),
+ # OLL source with run_id modification
+ (
+ "oll",
+ "block-v1:test+type@html+block@550e8400-e29b-41d4-a716-446655440000",
+ False,
+ "https://oll.org/courses/course-v1:course-v1:test_run/jump_to_id/550e8400-e29b-41d4-a716-446655440000",
+ ),
+ # Invalid cases
+ (
+ "",
+ "asset-v1:test+type@asset+block@file.txt",
+ False,
+ "None/asset-v1:test+type@asset+block@file.txt",
+ ), # Empty etl_source returns None as root_url
+ (
+ "mit_edx",
+ "block-v1:test+type@html+block@invalid-uuid",
+ False,
+ None,
+ ), # Invalid UUID
+ ("mit_edx", "unknown-format", False, None), # Unknown format
+ ],
+)
+def test_get_url_from_module_id(
+ settings,
+ etl_source,
+ module_id,
+ has_video_meta,
+ expected_url_pattern,
+):
+ """Test that get_url_from_module_id generates correct URLs for different module types"""
+ # Setup settings
+ settings.CONTENT_BASE_URL_EDX = "https://edx.org"
+ settings.CONTENT_BASE_URL_OLL = "https://oll.org"
+
+ run = LearningResourceRunFactory.create(
+ run_id="course-v1:test_run", learning_resource__etl_source=etl_source
+ )
+
+ # Setup metadata
+ video_srt_metadata = (
+ {
+ "asset-v1:test+type@asset+block@transcript.srt": "block-v1:test+type@video+block@test_video"
+ }
+ if has_video_meta
+ else None
+ )
+
+ result = utils.get_url_from_module_id(module_id, run, video_srt_metadata)
+
+ if expected_url_pattern:
+ assert result == expected_url_pattern
+ else:
+ assert result is None
diff --git a/main/settings_course_etl.py b/main/settings_course_etl.py
index a1bef5b17f..4b8ec0c6fd 100644
--- a/main/settings_course_etl.py
+++ b/main/settings_course_etl.py
@@ -105,3 +105,16 @@
TIKA_TIMEOUT = get_int("TIKA_TIMEOUT", 60)
TIKA_OCR_STRATEGY = get_string("TIKA_OCR_STRATEGY", "no_ocr")
SKIP_TIKA = get_bool("SKIP_TIKA", default=False)
+
+
+# Base content URLs for different sources
+CONTENT_BASE_URL_MITXONLINE = get_string(
+ "CONTENT_BASE_URL_MITXONLINE", "https://courses.mitxonline.mit.edu"
+)
+CONTENT_BASE_URL_XPRO = get_string(
+ "CONTENT_BASE_URL_XPRO", "https://courses.xpro.mit.edu"
+)
+CONTENT_BASE_URL_OLL = get_string(
+ "CONTENT_BASE_URL_OLL", "https://openlearninglibrary.mit.edu"
+)
+CONTENT_BASE_URL_EDX = get_string("CONTENT_BASE_URL_EDX", "https://courses.edx.org")