Skip to content

Commit 28dccfe

Browse files
authored
Add separate field for ocw topics, use best field to assign related topics (#1600)
1 parent 58b8503 commit 28dccfe

File tree

10 files changed

+450
-12
lines changed

10 files changed

+450
-12
lines changed

frontends/api/src/generated/v1/api.ts

Lines changed: 186 additions & 0 deletions
Large diffs are not rendered by default.

learning_resources/etl/ocw.py

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,40 @@ def parse_delivery(course_data: dict) -> list[str]:
7171
return delivery
7272

7373

74+
def _unique_topic_dicts(topic_lists: list[list[str]]) -> list[dict]:
    """
    Flatten nested topic lists into sorted, de-duplicated topic dicts.

    Args:
        topic_lists (list[list[str]]): Lists of topic names; the outer value
            and any inner list may be None or empty

    Returns:
        list[dict]: One {"name": <topic name>} dict per unique name, sorted by name
    """
    unique_names = {
        topic_name
        for topic_list in topic_lists or []
        # Skip null/empty sublists instead of raising TypeError on them
        for topic_name in topic_list or []
    }
    return [{"name": topic_name} for topic_name in sorted(unique_names)]


def parse_learn_topics(course_data: dict) -> list[dict]:
    """
    Parse topics. Use the "mit_learn_topics" field if it exists and isn't empty,
    otherwise use and transform the "topics" field values.

    In both cases the nested topic lists are flattened, de-duplicated and
    sorted by name before being returned or transformed.

    Args:
        course_data (dict): The course data

    Returns:
        list[dict]: The topics, each as a {"name": <topic name>} dict
    """
    mitlearn_topics = course_data.get("mit_learn_topics")
    if mitlearn_topics:
        # Should already be in the correct format
        return _unique_topic_dicts(mitlearn_topics)
    # Topics need to be transformed; "topics" is only read on this fallback path
    return transform_topics(
        _unique_topic_dicts(course_data.get("topics")),
        OFFERED_BY["code"],
    )
106+
107+
74108
def transform_content_files(
75109
s3_resource: boto3.resource,
76110
course_prefix: str,
@@ -329,10 +363,6 @@ def transform_course(course_data: dict) -> dict:
329363
readable_term = f"+{slugify(term)}" if term else ""
330364
readable_year = f"_{course_data.get('year')}" if year else ""
331365
readable_id = f"{course_data[PRIMARY_COURSE_ID]}{readable_term}{readable_year}"
332-
topics = transform_topics(
333-
[{"name": topic} for topics in course_data.get("topics") for topic in topics],
334-
OFFERED_BY["code"],
335-
)
336366
image_src = course_data.get("image_src")
337367

338368
return {
@@ -365,7 +395,14 @@ def transform_course(course_data: dict) -> dict:
365395
is_ocw=True,
366396
),
367397
},
368-
"topics": topics,
398+
"topics": parse_learn_topics(course_data),
399+
"ocw_topics": sorted(
400+
{
401+
topic_name
402+
for topic_sublist in course_data.get("topics", [])
403+
for topic_name in topic_sublist
404+
}
405+
),
369406
"runs": [transform_run(course_data)],
370407
"resource_type": LearningResourceType.course.name,
371408
"unique_field": UNIQUE_FIELD,

learning_resources/etl/ocw_test.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,15 @@
1818
)
1919
from learning_resources.etl.constants import CourseNumberType, ETLSource
2020
from learning_resources.etl.ocw import (
21+
parse_learn_topics,
2122
transform_content_files,
2223
transform_contentfile,
2324
transform_course,
2425
)
25-
from learning_resources.factories import ContentFileFactory
26+
from learning_resources.factories import (
27+
ContentFileFactory,
28+
LearningResourceTopicFactory,
29+
)
2630
from learning_resources.models import ContentFile
2731
from learning_resources.utils import (
2832
get_s3_object_and_read,
@@ -239,6 +243,14 @@ def test_transform_course( # noqa: PLR0913
239243
)
240244
transformed_json = transform_course(extracted_json)
241245
if expected_uid:
246+
assert transformed_json["ocw_topics"] == [
247+
"Anthropology",
248+
"Ethnography",
249+
"Humanities",
250+
"Philosophy",
251+
"Political Philosophy",
252+
"Social Science",
253+
]
242254
assert transformed_json["readable_id"] == expected_id
243255
assert transformed_json["etl_source"] == ETLSource.ocw.name
244256
assert transformed_json["delivery"] == expected_delivery
@@ -295,3 +307,45 @@ def test_transform_course( # noqa: PLR0913
295307
)
296308
else:
297309
assert transformed_json is None
310+
311+
312+
@pytest.mark.parametrize("has_learn_topics", [True, False])
def test_parse_topics(mocker, has_learn_topics):
    """Topics should be assigned correctly based on mitlearn topics if present, ocw topics if not"""
    # Pick the mitlearn input and the expected topic names for this scenario
    if has_learn_topics:
        learn_topic_lists = [
            ["Social Sciences", "Anthropology"],
            ["Social Sciences", "Political Science"],
        ]
        expected_names = ["Anthropology", "Political Science", "Social Sciences"]
    else:
        learn_topic_lists = []
        expected_names = [
            "Anthropology",
            "Anthropology",
            "Political Science",
            "Political Science",
        ]

    # Replace the offeror topic map loader with a fixed ocw -> learn mapping
    mocker.patch(
        "learning_resources.etl.utils.load_offeror_topic_map",
        return_value={
            "Political Philosophy": ["Philosophy"],
            "Ethnography": ["Anthropology"],
            "International Relations": ["Political Science"],
        },
    )
    # Create the topic records used by this test
    for topic in ("Social Sciences", "Anthropology", "Political Science"):
        LearningResourceTopicFactory.create(name=topic)

    course_data = {
        "topics": [
            ["Social Science", "Anthropology", "Ethnography"],
            ["Social Science", "Political Science", "International Relations"],
        ],
        "mit_learn_topics": learn_topic_lists,
    }

    assert parse_learn_topics(course_data) == [
        {"name": topic_name} for topic_name in expected_names
    ]
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Generated by Django 4.2.16 on 2024-09-23 18:04
2+
3+
import django.contrib.postgres.fields
4+
from django.db import migrations, models
5+
6+
7+
class Migration(migrations.Migration):
    """Add the `ocw_topics` array field to the `learningresource` model."""

    # Must be applied after the migration that this one builds on
    dependencies = [
        ("learning_resources", "0068_learningresource_format_pace"),
    ]

    operations = [
        # New Postgres array column of strings (each up to 128 characters);
        # it may be left blank and defaults to an empty list.
        migrations.AddField(
            model_name="learningresource",
            name="ocw_topics",
            field=django.contrib.postgres.fields.ArrayField(
                base_field=models.CharField(max_length=128),
                blank=True,
                default=list,
                # No maximum array length is declared
                size=None,
            ),
        ),
    ]

learning_resources/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,7 @@ class LearningResource(TimestampedModel):
411411
choices=((member.name, member.value) for member in LearningResourceType),
412412
)
413413
topics = models.ManyToManyField(LearningResourceTopic)
414+
ocw_topics = ArrayField(models.CharField(max_length=128), default=list, blank=True)
414415
offered_by = models.ForeignKey(
415416
LearningResourceOfferor, null=True, on_delete=models.SET_NULL
416417
)

learning_resources/serializers_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,7 @@ def test_learning_resource_serializer( # noqa: PLR0913
254254
serializers.LearningResourceTopicSerializer(topic).data
255255
for topic in resource.topics.all()
256256
],
257+
"ocw_topics": sorted(resource.ocw_topics),
257258
"runs": [
258259
serializers.LearningResourceRunSerializer(instance=run).data
259260
for run in resource.runs.all()

learning_resources_search/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ class FilterConfig:
7171
"run_id": FilterConfig("run_id", case_sensitive=True),
7272
"resource_id": FilterConfig("resource_id", case_sensitive=True),
7373
"topic": FilterConfig("topics.name"),
74+
"ocw_topic": FilterConfig("ocw_topics"),
7475
"level": FilterConfig("runs.level.code"),
7576
"department": FilterConfig("departments.department_id"),
7677
"platform": FilterConfig("platform.code"),
@@ -184,6 +185,7 @@ class FilterConfig:
184185
"channel_url": {"type": "keyword"},
185186
},
186187
},
188+
"ocw_topics": {"type": "keyword"},
187189
"offered_by": {
188190
"type": "nested",
189191
"properties": {

learning_resources_search/serializers.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,11 @@ class SearchRequestSerializer(serializers.Serializer):
276276
child=serializers.CharField(),
277277
help_text="The topic name. To see a list of options go to api/v1/topics/",
278278
)
279+
ocw_topic = serializers.ListField(
280+
required=False,
281+
child=serializers.CharField(),
282+
help_text="The ocw topic name.",
283+
)
279284
dev_mode = serializers.BooleanField(
280285
required=False,
281286
allow_null=True,

0 commit comments

Comments
 (0)