Skip to content

Commit 3d3e75c

Browse files
authored
add tutorbot problems model and etl (#2373)
1 parent 5ee3cdc commit 3d3e75c

File tree

9 files changed

+286
-1
lines changed

9 files changed

+286
-1
lines changed

app.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -704,6 +704,10 @@
704704
"POSTHOG_TIMEOUT_MS": {
705705
"description": "Timeout for communication with PostHog API",
706706
"required": false
707+
},
708+
"CANVAS_TUTORBOT_FOLDER": {
709+
"description": "Folder in Canvas course zip files where tutorbot problem and solution files are stored",
710+
"required": false
707711
}
708712
},
709713
"keywords": ["Django", "Python", "MIT", "Office of Digital Learning"],

learning_resources/constants.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,11 @@ class LearningResourceRelationTypes(TextChoices):
180180
zip(VALID_COURSE_CONTENT_TYPES, VALID_COURSE_CONTENT_TYPES)
181181
)
182182

183+
# Kinds of tutorbot files that can be stored for a course run
VALID_TUTOR_PROBLEM_TYPES = ["problem", "solution"]
# Choices pair every value with itself as its label
VALID_TUTOR_PROBLEM_TYPE_CHOICES = [
    (problem_type, problem_type) for problem_type in VALID_TUTOR_PROBLEM_TYPES
]
187+
183188
DEPARTMENTS = {
184189
"1": "Civil and Environmental Engineering",
185190
"2": "Mechanical Engineering",

learning_resources/etl/canvas.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def sync_canvas_archive(bucket, key: str, overwrite):
2525
"""
2626
Sync a Canvas course archive from S3
2727
"""
28-
from learning_resources.etl.loaders import load_content_files
28+
from learning_resources.etl.loaders import load_content_files, load_problem_files
2929

3030
course_folder = key.lstrip(settings.CANVAS_COURSE_BUCKET_PREFIX).split("/")[0]
3131

@@ -43,8 +43,16 @@ def sync_canvas_archive(bucket, key: str, overwrite):
4343
course_archive_path, run, overwrite=overwrite
4444
),
4545
)
46+
47+
load_problem_files(
48+
run,
49+
transform_canvas_problem_files(
50+
course_archive_path, run, overwrite=overwrite
51+
),
52+
)
4653
run.checksum = checksum
4754
run.save()
55+
4856
return resource_readable_id, run
4957

5058

@@ -124,6 +132,42 @@ def transform_canvas_content_files(
124132
yield from _process_olx_path(olx_path, run, overwrite=overwrite)
125133

126134

135+
def transform_canvas_problem_files(
    course_zipfile: Path, run: LearningResourceRun, *, overwrite
) -> Generator[dict, None, None]:
    """
    Transform problem files from a Canvas course zipfile

    Every archive member whose name starts with
    settings.CANVAS_TUTORBOT_FOLDER is extracted into a temporary directory,
    the extracted tree is passed to _process_olx_path, and one dict is
    yielded per processed file.

    Args:
        course_zipfile (Path): path to the Canvas course archive
        run (LearningResourceRun): the run passed through to _process_olx_path
        overwrite: forwarded unchanged to _process_olx_path

    Yields:
        dict: the "run", "content", "archive_checksum", "source_path" and
            "file_extension" entries of the processed file (those that are
            present), plus "problem_title" (first path segment below the
            tutorbot folder) and, when the second path segment is "problem"
            or "solution", "type"
    """
    tutorbot_folder = settings.CANVAS_TUTORBOT_FOLDER
    keys_to_keep = [
        "run",
        "content",
        "archive_checksum",
        "source_path",
        "file_extension",
    ]
    basedir = course_zipfile.name.split(".")[0]
    with (
        TemporaryDirectory(prefix=basedir) as olx_path,
        zipfile.ZipFile(course_zipfile.absolute(), "r") as course_archive,
    ):
        for member in course_archive.infolist():
            if member.filename.startswith(tutorbot_folder):
                course_archive.extract(member, path=olx_path)
                log.debug("processing active problem set file %s", member.filename)
        for file_data in _process_olx_path(olx_path, run, overwrite=overwrite):
            problem_file_data = {
                key: file_data[key] for key in keys_to_keep if key in file_data
            }
            # NOTE(review): assumes source_path begins with the tutorbot
            # folder (path relative to the archive root) - confirm against
            # _process_olx_path
            path = file_data["source_path"]
            path = path[len(tutorbot_folder) :]
            path_parts = path.split("/")
            problem_file_data["problem_title"] = path_parts[0]

            # A file that sits directly below the tutorbot folder has no
            # second path segment; guard the lookup so one such file cannot
            # abort the whole generator with an IndexError.
            if len(path_parts) > 1 and path_parts[1] in ["problem", "solution"]:
                problem_file_data["type"] = path_parts[1]
            yield problem_file_data
169+
170+
127171
def parse_module_meta(course_archive_path: str) -> dict:
128172
"""
129173
Parse module_meta.xml and return publish/active status of resources.

learning_resources/etl/loaders.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
PodcastEpisode,
4242
Program,
4343
RunInstructorRelationship,
44+
TutorProblemFile,
4445
Video,
4546
VideoChannel,
4647
VideoPlaylist,
@@ -807,6 +808,63 @@ def load_content_files(
807808
return None
808809

809810

811+
def load_problem_file(
    course_run: LearningResourceRun, problem_file_data: dict
) -> int | None:
    """
    Sync a tutorbot problem to the database

    The TutorProblemFile is looked up by run and source_path and is created
    or updated with the values in problem_file_data.

    Args:
        course_run (LearningResourceRun): a LearningResourceRun for a Course
        problem_file_data (dict): File metadata as JSON

    Returns:
        int or None: the id of the object that was created or updated, or
            None if the sync failed (the error is logged, not raised)
    """
    try:
        problem_file, _ = TutorProblemFile.objects.update_or_create(
            run=course_run,
            source_path=problem_file_data.get("source_path"),
            defaults=problem_file_data,
        )
    except Exception:  # noqa: BLE001
        # Best effort: one bad file must not abort the rest of the sync, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        log.exception(
            "ERROR syncing problem file %s for run %d",
            problem_file_data.get("source_path", ""),
            course_run.id,
        )
        return None
    return problem_file.id
837+
838+
839+
def load_problem_files(
    course_run: LearningResourceRun,
    problem_files_data: list[dict],
) -> list[int]:
    """
    Sync all problem files for canvas course

    Each entry is loaded with load_problem_file; afterwards every
    TutorProblemFile of the run whose id was not returned by this sync is
    deleted.

    Args:
        course_run (LearningResourceRun): a course run
        problem_files_data (list or generator): Details about the problem files

    Returns:
        list of int: Ids of the TutorProblemFile objects that were created/updated

    """
    loaded_ids = (
        load_problem_file(course_run, problem_file)
        for problem_file in problem_files_data
    )
    # load_problem_file returns None for a file that failed to sync; keep
    # those out of the result so it really is a list of ids.
    problem_files_ids = [
        problem_file_id for problem_file_id in loaded_ids if problem_file_id is not None
    ]

    stale_files = TutorProblemFile.objects.filter(run=course_run).exclude(
        id__in=problem_files_ids
    )
    # Delete one instance at a time, as before, rather than in bulk
    for stale_file in stale_files:
        stale_file.delete()

    return problem_files_ids
866+
867+
810868
def load_podcast_episode(episode_data: dict) -> LearningResource:
811869
"""
812870
Load a podcast_episode into the database

learning_resources/etl/loaders_test.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
load_podcast,
4444
load_podcast_episode,
4545
load_podcasts,
46+
load_problem_file,
47+
load_problem_files,
4648
load_program,
4749
load_programs,
4850
load_run,
@@ -81,6 +83,7 @@
8183
LearningResourceRun,
8284
PodcastEpisode,
8385
Program,
86+
TutorProblemFile,
8487
Video,
8588
VideoChannel,
8689
VideoPlaylist,
@@ -1094,6 +1097,67 @@ def test_load_content_file():
10941097
)
10951098

10961099

1100+
def test_load_problem_file():
    """load_problem_file should store a TutorProblemFile and return its id"""
    run = LearningResourceRunFactory.create()
    props = {
        "problem_title": "Problem 1",
        "type": "problem",
        "source_path": "ai/tutor/problems/Problem 1/problem/problem1",
        "content": "This is the content of the problem file.",
    }

    result = load_problem_file(run, props)

    # the loader hands back the primary key of the stored row
    assert isinstance(result, int)
    assert TutorProblemFile.objects.count() == 1

    loaded_file = TutorProblemFile.objects.get(pk=result)
    assert loaded_file.run == run

    # every supplied property must be persisted unchanged
    for key in props:
        value = props[key]
        assert getattr(loaded_file, key) == value, (
            f"Property {key} should equal {value}"
        )
1125+
1126+
1127+
def test_load_problem_files(mocker):
    """Test that load_problem_files syncs the given files and deletes stale ones"""
    course = LearningResourceFactory.create(is_course=True, create_runs=False)
    course_run = LearningResourceRunFactory.create(learning_resource=course)
    LearningResourceRunFactory.create(
        learning_resource=course,
        start_date=now_in_utc() - timedelta(days=365),
    )
    assert course.runs.count() == 2

    # The stale row must be a TutorProblemFile of the same run. A ContentFile
    # lives in another table, so it would never exercise the cleanup and its
    # id could collide with a freshly created TutorProblemFile.
    deleted_problem_file = TutorProblemFile.objects.create(
        run=course_run,
        problem_title="Stale problem",
        type="problem",
        source_path="ai/tutor/problems/Stale problem/problem/stale",
    )

    content_data = [
        {
            "problem_title": "Problem 1",
            "type": "problem",
            "source_path": "ai/tutor/problems/Problem 1/problem/problem1",
        },
        {
            "problem_title": "Problem 1",
            "type": "solution",
            "source_path": "ai/tutor/problems/Problem 1/solution/sol1",
        },
    ]

    load_problem_files(course_run, content_data)

    assert TutorProblemFile.objects.filter(id=deleted_problem_file.id).exists() is False
    for file in content_data:
        assert TutorProblemFile.objects.filter(
            run=course_run, source_path=file["source_path"]
        ).exists()
1159+
1160+
10971161
def test_load_image():
10981162
"""Test that image resources are uniquely created or retrieved based on parameters"""
10991163
resource_url = "https://mit.edu"

learning_resources/factories.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -878,6 +878,25 @@ class Meta:
878878
skip_postgeneration_save = True
879879

880880

881+
class TutorProblemFileFactory(DjangoModelFactory):
    """Factory for TutorProblemFiles"""

    # No run is generated by default; callers pass run=... explicitly
    run = None
    problem_title = factory.Faker("sentence")
    # FuzzyChoice takes ONE iterable of choices; a second positional argument
    # is the getter callable, so the options must be passed as a single tuple.
    type = FuzzyChoice(("problem", "solution"))
    content = factory.Faker("text")
    source_path = factory.Faker("file_path", extension="txt")

    @classmethod
    def _create(cls, model_class, *args, **kwargs):
        """Create the model instance, always passing an explicit run kwarg"""
        kwargs.setdefault("run", None)
        return super()._create(model_class, *args, **kwargs)

    class Meta:
        model = models.TutorProblemFile
898+
899+
881900
class VideoPlaylistFactory(DjangoModelFactory):
882901
"""Factory for Video Playlists"""
883902

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# Generated by Django 4.2.23 on 2025-07-18 23:26
2+
3+
import django.db.models.deletion
4+
from django.db import migrations, models
5+
6+
7+
class Migration(migrations.Migration):
    # Creates the TutorProblemFile table

    dependencies = [("learning_resources", "0091_content_file_content_view_group")]

    operations = [
        migrations.CreateModel(
            name="TutorProblemFile",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # timestamp columns
                ("created_on", models.DateTimeField(auto_now_add=True, db_index=True)),
                ("updated_on", models.DateTimeField(auto_now=True)),
                # problem metadata and body
                ("problem_title", models.CharField(blank=True, max_length=1024, null=True)),
                (
                    "type",
                    models.CharField(
                        choices=[("problem", "problem"), ("solution", "solution")],
                        max_length=128,
                    ),
                ),
                ("content", models.TextField(blank=True, null=True)),
                # details of the source file
                ("archive_checksum", models.CharField(blank=True, max_length=32, null=True)),
                ("source_path", models.CharField(blank=True, max_length=1024, null=True)),
                ("file_extension", models.CharField(blank=True, max_length=32, null=True)),
                # owning run; rows are removed together with the run
                (
                    "run",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="problem_files",
                        to="learning_resources.learningresourcerun",
                    ),
                ),
            ],
            options={"abstract": False},
        ),
    ]

learning_resources/models.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -861,6 +861,32 @@ def for_serialization(self):
861861
)
862862

863863

864+
class TutorProblemFile(TimestampedModel):
    """
    A tutorbot problem or solution file that belongs to a LearningResourceRun
    """

    # Owning run; deleting the run cascades to its problem files
    run = models.ForeignKey(
        LearningResourceRun,
        on_delete=models.CASCADE,
        related_name="problem_files",
        null=False,
        blank=False,
    )

    problem_title = models.CharField(max_length=1024, null=True, blank=True)  # noqa: DJ001

    # One of constants.VALID_TUTOR_PROBLEM_TYPE_CHOICES
    type = models.CharField(
        choices=constants.VALID_TUTOR_PROBLEM_TYPE_CHOICES,
        max_length=128,
    )

    content = models.TextField(null=True, blank=True)  # noqa: DJ001

    # Details of the file this row was loaded from
    archive_checksum = models.CharField(max_length=32, null=True, blank=True)  # noqa: DJ001
    source_path = models.CharField(max_length=1024, null=True, blank=True)  # noqa: DJ001
    file_extension = models.CharField(max_length=32, null=True, blank=True)  # noqa: DJ001
888+
889+
864890
class ContentFile(TimestampedModel):
865891
"""
866892
ContentFile model for LearningResourceRun files

main/settings.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -850,3 +850,4 @@ def get_all_config_keys():
850850
OPENTELEMETRY_ENDPOINT = get_string("OPENTELEMETRY_ENDPOINT", None)
851851
OPENTELEMETRY_TRACES_BATCH_SIZE = get_int("OPENTELEMETRY_TRACES_BATCH_SIZE", 512)
852852
OPENTELEMETRY_EXPORT_TIMEOUT_MS = get_int("OPENTELEMETRY_EXPORT_TIMEOUT_MS", 5000)
853+
CANVAS_TUTORBOT_FOLDER = get_string("CANVAS_TUTORBOT_FOLDER", "web_resources/ai/tutor/")

0 commit comments

Comments
 (0)