|
18 | 18 | from pathlib import Path
|
19 | 19 | from subprocess import check_call
|
20 | 20 | from tempfile import TemporaryDirectory
|
| 21 | +from typing import Optional |
21 | 22 |
|
22 | 23 | import boto3
|
23 | 24 | import rapidjson
|
24 | 25 | import requests
|
| 26 | +from defusedxml.ElementTree import ParseError, parse |
25 | 27 | from django.conf import settings
|
26 | 28 | from django.utils.dateparse import parse_duration
|
27 | 29 | from django.utils.text import slugify
|
@@ -341,7 +343,7 @@ def documents_from_olx(
|
341 | 343 | "mime_type": mimetype,
|
342 | 344 | "archive_checksum": archive_checksum,
|
343 | 345 | "file_extension": extension_lower,
|
344 |
| - "source_path": f"{path}/{filename}", |
| 346 | + "source_path": f"{path}/{filename.replace(' ', '_')}", |
345 | 347 | },
|
346 | 348 | )
|
347 | 349 |
|
@@ -407,7 +409,150 @@ def text_from_sjson_content(content: str):
|
407 | 409 | return " ".join(data.get("text", []))
|
408 | 410 |
|
409 | 411 |
|
def get_root_url_for_source(etl_source: str) -> str | None:
    """
    Get the root URL of the platform that hosts an ETL source

    Args:
        etl_source (str): The ETL source value, compared against the
            ETLSource enum values

    Returns:
        str | None: The root URL for the source, or None if the source has
            no entry in the mapping
    """
    mapping = {
        ETLSource.mitxonline.value: "https://courses.mitxonline.mit.edu",
        ETLSource.xpro.value: "https://courses.xpro.mit.edu",
        ETLSource.mit_edx.value: "https://www.edx.org",
        ETLSource.oll.value: "https://openlearninglibrary.mit.edu",
    }
    # dict.get returns None for sources without a known root URL; callers
    # must handle that case before building URLs from the result
    return mapping.get(etl_source)
| 429 | + |
| 430 | + |
def is_valid_uuid(uuid_string: str) -> bool:
    """
    Check if a string is a valid UUID

    Args:
        uuid_string (str): The candidate value

    Returns:
        bool: True if uuid.UUID accepts the value, False otherwise
            (including when the value is None or not a string)
    """
    try:
        uuid.UUID(uuid_string)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed hex string
        # TypeError: None was passed; AttributeError: a non-string was passed
        return False
    return True
| 440 | + |
| 441 | + |
def get_url_from_module_id(
    olx_path: str,
    module_id: str,
    run: LearningResourceRun,
    assets_metadata: Optional[dict] = None,
    video_srt_metadata: Optional[dict] = None,
) -> Optional[str]:
    """
    Get the URL for a module based on its ID

    Args:
        olx_path (str): The source path of the content file; its final
            component is the key used to look up assets_metadata
        module_id (str): The module ID
        run (LearningResourceRun): The run associated with the module
        assets_metadata (dict): Optional asset metadata keyed by file name
        video_srt_metadata (dict): Optional mapping of transcript module ID
            to the module ID of the video it belongs to

    Returns:
        str | None: The URL for the module, or None if the module ID is
            empty, the run's ETL source has no known root URL, or the
            module ID format is not recognized
    """
    if not module_id:
        log.warning("Module ID is empty")
        return None
    root_url = get_root_url_for_source(run.learning_resource.etl_source)
    if not root_url:
        # Without a root URL the f-strings below would yield "None/..."
        log.warning("No root URL found for the ETL source of run %s", run)
        return None
    if module_id.startswith("asset"):
        log.info("Getting URL for asset %s", module_id)
        video_meta = video_srt_metadata.get(module_id, {}) if video_srt_metadata else {}
        if video_meta:
            # A transcript asset links to the xblock of its parent video
            log.info("Found video metadata for %s", module_id)
            return f"{root_url}/xblock/{video_meta}"
        if module_id.endswith(".srt"):
            log.info("NO VIDEO METADATA FOR %s", module_id)
            # Path.name is "" for an empty path, where parts[-1] would raise
            asset_meta = (
                assets_metadata.get(Path(olx_path).name, {}) if assets_metadata else {}
            )
            middle_path = asset_meta.get("custom_md5", "")
            return f"{root_url}/{(middle_path + '/') if middle_path else ''}{module_id}"
        # Other asset types have no URL
        return None
    if module_id.startswith("block") and is_valid_uuid(module_id.split("@")[-1]):
        return f"{root_url}/xblock/{module_id}"
    log.warning("Unknown module ID format: %s", module_id)
    return None
| 483 | + |
| 484 | + |
def get_assets_metadata(olx_path: str) -> dict:
    """
    Get metadata for assets in an OLX path

    Args:
        olx_path (str): The path to the OLX directory

    Returns:
        dict: The parsed contents of policies/assets.json under olx_path,
            or an empty dict if that file does not exist
    """
    assets_file = Path(olx_path, "policies/assets.json")
    try:
        with assets_file.open("rb") as f:
            return json.load(f)
    except FileNotFoundError:
        log.warning("Assets metadata file does not exist: %s", olx_path)
        # Honor the declared dict return type instead of falling through
        # to an implicit None
        return {}
| 497 | + |
| 498 | + |
def parse_video_transcripts_xml(
    run: LearningResourceRun, xml_content: str, path: Path
) -> dict:
    """
    Parse video XML content and create a mapping of
    transcript edx_module_id to video edx_module_id

    Args:
        run (LearningResourceRun): The run the video belongs to
        xml_content (str): The XML text of the video definition
        path (Path): The path of the video XML file, used to derive the
            video's edx_module_id and for error logging

    Returns:
        dict: Mapping of transcript edx_module_id to video edx_module_id;
            empty if the XML cannot be parsed, the root element has no
            url_name, or no transcript has a src attribute
    """
    # Function-scope import: the module-level import only brings in
    # ParseError and parse, and parse() expects a file, not XML text
    from defusedxml.ElementTree import fromstring

    transcript_mapping = {}
    try:
        root = fromstring(xml_content)
    except ParseError:
        log.exception("Error parsing video XML for %s: %s", run, path)
        return transcript_mapping

    # Only video elements that declare a url_name are mapped
    if not root.get("url_name"):
        log.warning("No url_name found in video XML")
        return {}

    # Collect the src attribute of every transcript element that has one
    transcript_sources = [
        transcript.get("src")
        for transcript in root.findall(".//transcript")
        if transcript.get("src")
    ]
    if not transcript_sources:
        return transcript_mapping

    # The video module ID is the same for every transcript, so derive it once
    video_module_id = get_edx_module_id(str(path), run)
    for transcript_src in transcript_sources:
        transcript_mapping[get_edx_module_id(f"static/{transcript_src}", run)] = (
            video_module_id
        )
    return transcript_mapping
| 526 | + |
| 527 | + |
def get_video_metadata(olx_path: str, run: LearningResourceRun) -> dict:
    """
    Get metadata for video SRT files in an OLX path

    Walks the "video" directory under olx_path, parses every XML file in
    it, and merges the transcript mappings they produce.

    Args:
        olx_path (str): The path to the OLX directory
        run (LearningResourceRun): The run the videos belong to

    Returns:
        dict: The merged result of parse_video_transcripts_xml for every
            video XML file; empty if there is no video directory
    """
    video_transcript_mapping = {}
    video_path = Path(olx_path, "video")
    if not video_path.exists():
        log.warning("No video directory found in OLX path: %s", olx_path)
        return video_transcript_mapping
    for root, _, files in os.walk(str(video_path)):
        # NOTE(review): dropping the first three components presumably
        # strips the extraction directory prefix - confirm against the
        # equivalent logic in documents_from_olx
        path = "/".join(root.split("/")[3:])
        for filename in files:
            log.info("Processing video file %s in %s", filename, path)
            if Path(filename).suffix.lower() != ".xml":
                continue
            video_xml = Path(root, filename).read_bytes().decode("utf-8")

            # Pass the file's relative path (not the closed file handle) so
            # the parser can derive the video's module ID from it
            video_transcript_mapping.update(
                parse_video_transcripts_xml(run, video_xml, Path(path, filename))
            )

    return video_transcript_mapping
| 551 | + |
| 552 | + |
410 | 553 | def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite):
|
| 554 | + assets_metadata = get_assets_metadata(olx_path) |
| 555 | + video_srt_metadata = get_video_metadata(olx_path, run) |
411 | 556 | for document, metadata in documents_from_olx(olx_path):
|
412 | 557 | source_path = metadata.get("source_path")
|
413 | 558 | edx_module_id = get_edx_module_id(source_path, run)
|
@@ -465,6 +610,9 @@ def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite):
|
465 | 610 | "file_extension": file_extension,
|
466 | 611 | "source_path": source_path,
|
467 | 612 | "edx_module_id": edx_module_id,
|
| 613 | + "url": get_url_from_module_id( |
| 614 | + source_path, edx_module_id, run, assets_metadata, video_srt_metadata |
| 615 | + ), |
468 | 616 | **content_dict,
|
469 | 617 | }
|
470 | 618 | )
|
@@ -741,7 +889,7 @@ def parse_certification(offeror, runs_data):
|
741 | 889 | )
|
742 | 890 |
|
743 | 891 |
|
744 |
| -def iso8601_duration(duration_str: str) -> str or None: |
| 892 | +def iso8601_duration(duration_str: str) -> str | None: |
745 | 893 | """
|
746 | 894 | Parse the duration from a string and return it in ISO-8601 format
|
747 | 895 |
|
@@ -821,7 +969,7 @@ def calculate_weeks(num: int, from_unit: str) -> int:
|
821 | 969 | return num
|
822 | 970 |
|
823 | 971 |
|
824 |
| -def transform_interval(interval_txt: str) -> str or None: |
| 972 | +def transform_interval(interval_txt: str) -> str | None: |
825 | 973 | """
|
826 | 974 | Transform any interval units to standard English units
|
827 | 975 | Only languages currently supported are English and Spanish
|
|
0 commit comments