Skip to content

Commit 411914f

Browse files
committed
Assign urls to edx contentfiles when possible
1 parent 8ca3a45 commit 411914f

File tree

3 files changed

+153
-3
lines changed

3 files changed

+153
-3
lines changed

learning_resources/etl/utils.py

Lines changed: 151 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@
1818
from pathlib import Path
1919
from subprocess import check_call
2020
from tempfile import TemporaryDirectory
21+
from typing import Optional
2122

2223
import boto3
2324
import rapidjson
2425
import requests
26+
from defusedxml.ElementTree import ParseError, parse
2527
from django.conf import settings
2628
from django.utils.dateparse import parse_duration
2729
from django.utils.text import slugify
@@ -341,7 +343,7 @@ def documents_from_olx(
341343
"mime_type": mimetype,
342344
"archive_checksum": archive_checksum,
343345
"file_extension": extension_lower,
344-
"source_path": f"{path}/{filename}",
346+
"source_path": f"{path}/{filename.replace(' ', '_')}",
345347
},
346348
)
347349

@@ -407,7 +409,150 @@ def text_from_sjson_content(content: str):
407409
return " ".join(data.get("text", []))
408410

409411

412+
def get_root_url_for_source(etl_source: str) -> str | None:
    """
    Get the base URL for an ETL source

    Args:
        etl_source (str): The ETL source code (compared against ETLSource values)

    Returns:
        str | None: The base URL mapped to the source, or None if the source
            has no entry in the mapping
    """
    mapping = {
        ETLSource.mitxonline.value: "https://courses.mitxonline.mit.edu",
        ETLSource.xpro.value: "https://courses.xpro.mit.edu",
        ETLSource.mit_edx.value: "https://www.edx.org",
        ETLSource.oll.value: "https://openlearninglibrary.mit.edu",
    }
    # dict.get yields None for unmapped sources; callers must handle that case
    return mapping.get(etl_source)
429+
430+
431+
def is_valid_uuid(uuid_string: str) -> bool:
    """
    Report whether a string can be parsed as a UUID

    Args:
        uuid_string (str): The candidate string

    Returns:
        bool: True if uuid.UUID accepts the string, False if it raises ValueError
    """
    try:
        uuid.UUID(uuid_string)
    except ValueError:
        parsed_ok = False
    else:
        parsed_ok = True
    return parsed_ok
440+
441+
442+
def get_url_from_module_id(
    olx_path: str,
    module_id: str,
    run: LearningResourceRun,
    assets_metadata: Optional[dict] = None,
    video_srt_metadata: Optional[dict] = None,
) -> Optional[str]:
    """
    Get the URL for a module based on its ID

    Args:
        olx_path (str): Path of the content file; its final component is the
            key used to look the file up in assets_metadata
        module_id (str): The module ID
        run (LearningResourceRun): The run associated with the module
        assets_metadata (dict): Optional asset metadata keyed by file name;
            an entry's "custom_md5" value, if any, is inserted into asset URLs
        video_srt_metadata (dict): Optional mapping of a transcript's module ID
            to the module ID of the video it belongs to

    Returns:
        str: The URL for the module, or None if the module ID is empty or of
            an unknown format, or if the run's ETL source has no root URL
    """
    if not module_id:
        log.warning("Module ID is empty")
        return None
    etl_source = run.learning_resource.etl_source
    root_url = get_root_url_for_source(etl_source)
    if not root_url:
        # Without a root URL every f-string below would start with "None/"
        log.warning("No root URL for ETL source: %s", etl_source)
        return None
    if module_id.startswith("asset"):
        log.info("Getting URL for asset %s", module_id)
        video_meta = video_srt_metadata.get(module_id) if video_srt_metadata else None
        if video_meta:
            # Transcripts link to the video block they belong to
            log.info("Found video metadata for %s", module_id)
            return f"{root_url}/xblock/{video_meta}"
        if module_id.endswith(".srt"):
            log.info("NO VIDEO METADATA FOR %s", module_id)
        # Path.name is "" for an empty path, where parts[-1] would raise
        asset_meta = (assets_metadata or {}).get(Path(olx_path).name, {})
        middle_path = asset_meta.get("custom_md5", "")
        return f"{root_url}/{(middle_path + '/') if middle_path else ''}{module_id}"
    if module_id.startswith("block") and is_valid_uuid(module_id.split("@")[-1]):
        return f"{root_url}/xblock/{module_id}"
    log.warning("Unknown module ID format: %s", module_id)
    return None
483+
484+
485+
def get_assets_metadata(olx_path: str) -> dict:
    """
    Get metadata for assets in an OLX path

    Args:
        olx_path (str): The path to the OLX directory

    Returns:
        dict: The parsed contents of policies/assets.json under olx_path, or
            an empty dict if that file does not exist
    """
    assets_file = Path(olx_path, "policies", "assets.json")
    try:
        with assets_file.open("rb") as f:
            return json.load(f)
    except FileNotFoundError:
        log.warning("Assets metadata file does not exist: %s", olx_path)
        # Honor the declared return type instead of falling through to None
        return {}
497+
498+
499+
def parse_video_transcripts_xml(
    run: LearningResourceRun, xml_content: str, path: Path
) -> dict:
    """
    Parse video XML content and create a mapping of
    transcript edx_module_id to video edx_module_id

    Args:
        run (LearningResourceRun): The run the video belongs to
        xml_content (str): The text of the video XML document
        path (Path): The path of the video XML file, used to derive the
            video's edx_module_id

    Returns:
        dict: Transcript edx_module_id -> video edx_module_id. Empty if the
            XML cannot be parsed, the root element has no url_name, or no
            transcript element has a src attribute
    """
    # `parse` takes a file name or file object; the XML here is already a
    # string in memory, so it has to go through `fromstring`, which also
    # returns the root element directly.
    from defusedxml.ElementTree import fromstring  # noqa: PLC0415

    transcript_mapping = {}
    try:
        root = fromstring(xml_content)
    except ParseError:
        log.exception("Error parsing video XML for %s: %s", run, path)
        return transcript_mapping

    # Get the video url_name from the root video element
    video_url_name = root.get("url_name")
    if not video_url_name:
        log.warning("No url_name found in video XML")
        return {}

    # Find all transcript elements and extract their src attributes
    for transcript in root.findall(".//transcript"):
        transcript_src = transcript.get("src")
        if transcript_src:
            transcript_mapping[
                get_edx_module_id(f"static/{transcript_src}", run)
            ] = get_edx_module_id(str(path), run)
    return transcript_mapping
526+
527+
528+
def get_video_metadata(olx_path: str, run: LearningResourceRun) -> dict:
    """
    Get metadata for video SRT files in an OLX path

    Args:
        olx_path (str): The path to the OLX directory
        run (LearningResourceRun): The run the OLX content belongs to

    Returns:
        dict: The merged results of parse_video_transcripts_xml for every
            .xml file found under the "video" directory of olx_path; empty if
            that directory does not exist
    """
    video_transcript_mapping = {}
    video_path = Path(olx_path, "video")
    if not video_path.exists():
        log.warning("No video directory found in OLX path: %s", olx_path)
        return video_transcript_mapping
    for root, _, files in os.walk(str(video_path)):
        # Only used for the log line below
        path = "/".join(root.split("/")[3:])
        for filename in files:
            log.info("Processing video file %s in %s", filename, path)
            if Path(filename).suffix.lower() != ".xml":
                continue
            xml_path = Path(root, filename)
            video_xml = xml_path.read_bytes().decode("utf-8")

            # Parse the XML and get transcript mappings. Pass the file's path,
            # not an open file handle: the parser stringifies this argument.
            transcript_mapping = parse_video_transcripts_xml(run, video_xml, xml_path)
            video_transcript_mapping.update(transcript_mapping)

    return video_transcript_mapping
551+
552+
410553
def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite):
554+
assets_metadata = get_assets_metadata(olx_path)
555+
video_srt_metadata = get_video_metadata(olx_path, run)
411556
for document, metadata in documents_from_olx(olx_path):
412557
source_path = metadata.get("source_path")
413558
edx_module_id = get_edx_module_id(source_path, run)
@@ -465,6 +610,9 @@ def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite):
465610
"file_extension": file_extension,
466611
"source_path": source_path,
467612
"edx_module_id": edx_module_id,
613+
"url": get_url_from_module_id(
614+
source_path, edx_module_id, run, assets_metadata, video_srt_metadata
615+
),
468616
**content_dict,
469617
}
470618
)
@@ -741,7 +889,7 @@ def parse_certification(offeror, runs_data):
741889
)
742890

743891

744-
def iso8601_duration(duration_str: str) -> str or None:
892+
def iso8601_duration(duration_str: str) -> str | None:
745893
"""
746894
Parse the duration from a string and return it in ISO-8601 format
747895
@@ -821,7 +969,7 @@ def calculate_weeks(num: int, from_unit: str) -> int:
821969
return num
822970

823971

824-
def transform_interval(interval_txt: str) -> str or None:
972+
def transform_interval(interval_txt: str) -> str | None:
825973
"""
826974
Transform any interval units to standard English units
827975
Only languages currently supported are English and Spanish

learning_resources_search/indexing_api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,7 @@ def index_run_content_files(run_id, index_types):
398398
index_types (string): one of the values IndexestoUpdate. Whether the default
399399
index, the reindexing index or both need to be updated
400400
"""
401+
return
401402
run = LearningResourceRun.objects.get(pk=run_id)
402403
content_file_ids = run.content_files.filter(published=True).values_list(
403404
"id", flat=True

vector_search/tasks.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ def generate_embeddings(ids, resource_type, overwrite):
5252
resource_type (string): resource_type value for the learning resource objects
5353
5454
"""
55+
return None
5556
try:
5657
with wrap_retry_exception(*SEARCH_CONN_EXCEPTIONS):
5758
embed_learning_resources(ids, resource_type, overwrite)

0 commit comments

Comments
 (0)