optimed best_match algorithm by adding eagerloading to reduce number of required db-queries

kkoehn · kkoehn · commit 745611351ad2 · 2025-10-14T23:18:44.000+02:00
diff --git a/app/models/proxy_exercise.rb b/app/models/proxy_exercise.rb
@@ -78,13 +78,18 @@ def get_matching_exercise(user)
   end
 
   def find_matching_exercise(user)
-    exercises_user_has_accessed = user.submissions.where(cause: %w[submit assess remoteSubmit remoteAssess]).map(&:exercise).uniq.compact
-    tags_user_has_seen = exercises_user_has_accessed.map(&:tags).uniq.flatten
+    exercises_user_has_accessed =
+      Exercise
+        .joins(:submissions)
+        .merge(user.submissions.where(cause: %w[submit assess remoteSubmit remoteAssess]))
+        .includes(:tags, :files)
+        .distinct
+    tags_user_has_seen = exercises_user_has_accessed.flat_map(&:tags).uniq
     Rails.logger.debug { "exercises_user_has_accessed #{exercises_user_has_accessed.map(&:id).join(',')}" }
 
     # find exercises
     potential_recommended_exercises = []
-    exercises.where('expected_difficulty >= 1').find_each do |ex|
+    exercises.where('expected_difficulty >= 1').includes(:tags, exercise_tags: :tag).find_each do |ex|
       ## find exercises which have only tags the user has already seen
       if (ex.tags - tags_user_has_seen).empty?
         potential_recommended_exercises << ex
@@ -119,9 +124,9 @@ def select_best_matching_exercise(user, exercises_user_has_accessed, potential_r
       relative_knowledge_improvement[potex] = 0.0
       Rails.logger.debug { "review potential exercise #{potex.id}" }
       tags.each do |tag|
-        tag_ratio = potex.exercise_tags.find_by(tag:).factor.to_f / potex.exercise_tags.inject(0) do |sum, et|
-                                                                      sum + et.factor
-                                                                    end
+        tag_ratio = potex.exercise_tags.find {|et| et.tag == tag }.factor.to_f / potex.exercise_tags.inject(0) do |sum, et|
+          sum + et.factor
+        end
         max_topic_knowledge_ratio = potex.expected_difficulty * tag_ratio
         old_relative_loss_tag = topic_knowledge_user[tag] / topic_knowledge_max[tag]
         new_relative_loss_tag = topic_knowledge_user[tag] / (topic_knowledge_max[tag] + max_topic_knowledge_ratio)
diff --git a/spec/factories/tag.rb b/spec/factories/tag.rb
@@ -0,0 +1,7 @@
+# frozen_string_literal: true
+
+FactoryBot.define do
+  factory :tag do
+    name { 'tagName' }
+  end
+end
diff --git a/spec/models/proxy_exercise_performance_spec.rb b/spec/models/proxy_exercise_performance_spec.rb
@@ -0,0 +1,175 @@
+# frozen_string_literal: true
+
+require 'rails_helper'
+
+# AI generated to capture performance characteristics of ProxyExercise
+RSpec.describe ProxyExercise, skip: 'Slow tests, activate manually' do
+  # Simple SQL capture helper using ActiveSupport notifications
+  def capture_sql(&)
+    events = []
+    callback = ->(_name, _start, _finish, _id, payload) { events << payload[:sql] }
+    ActiveSupport::Notifications.subscribed(callback, 'sql.active_record', &)
+    events
+  end
+
+  def query_count(&)
+    capture_sql(&).size
+  end
+
+  describe 'performance characteristics' do
+    describe '#count_files' do
+      it 'uses an efficient COUNT query instead of loading all exercises' do
+        pe = create(:proxy_exercise)
+        create_list(:dummy, 20).each {|ex| pe.exercises << ex }
+
+        # Warm-up (avoid counting queries from lazy loading of associations)
+        pe.reload
+
+        sqls = capture_sql { expect(pe.count_files).to eq(20) }
+
+        # We expect a single COUNT(*) style query (plus, in some adapters, an implicit PRAGMA/SHOW if any)
+        # Be tolerant: assert that at least one of the queries contains COUNT and total number is small
+        has_count = sqls.any? {|sql| sql =~ /COUNT\s*\(/i }
+        expect(has_count).to be(true), "Expected a COUNT query, got: #{sqls.inspect}"
+        expect(sqls.size).to be <= 3 # 1 main COUNT + possible noise
+      end
+    end
+
+    describe "#get_matching_exercise when algorithm is 'random'" do
+      let(:user) { create(:learner) }
+
+      it 'does not increase the number of SQL queries with a larger exercise pool' do
+        pe_small = create(:proxy_exercise, algorithm: 'random')
+        chosen_small = create(:dummy)
+        others_small = create_list(:dummy, 9)
+        pe_small.exercises << ([chosen_small] + others_small)
+        allow(pe_small).to receive_message_chain(:exercises, :sample).and_return(chosen_small)
+
+        pe_large = create(:proxy_exercise, algorithm: 'random')
+        chosen_large = create(:dummy)
+        others_large = create_list(:dummy, 99)
+        pe_large.exercises << ([chosen_large] + others_large)
+        allow(pe_large).to receive_message_chain(:exercises, :sample).and_return(chosen_large)
+
+        # Measure queries for small pool
+        small_queries = query_count { pe_small.get_matching_exercise(user) }
+
+        # Measure queries for large pool
+        large_queries = query_count { pe_large.get_matching_exercise(user) }
+
+        # The random algorithm should not need additional queries proportional to the pool size.
+        # Allow a tiny slack for adapter differences and inserts of assignment rows.
+        expect(large_queries - small_queries).to be <= 2,
+          "Expected roughly constant query count, got small=#{small_queries}, large=#{large_queries}"
+      end
+    end
+
+    describe "#get_matching_exercise when algorithm is 'best_match'" do
+      it 'reports current SQL query counts across pool sizes (opt-in via PERF=1)' do
+        ActiveRecord.verbose_query_logs = true
+
+        # Helper to build an exercise with given tags and difficulty
+        def build_tagged_exercise(difficulty:, tags:)
+          ex = create(:dummy, expected_difficulty: difficulty)
+          tags.each do |tag|
+            ExerciseTag.create!(exercise: ex, tag:, factor: 1)
+          end
+          ex
+        end
+
+        # Prepare a fixed tag universe to be stable across runs
+        tags = (1..5).map {|i| Tag.create!(name: "perf-tag-#{i}-#{SecureRandom.hex(2)}") }
+
+        # Build a small history of exercises the user has accessed, covering all tags
+        user = create(:learner)
+        seen_exercises = Array.new(5) {|i| build_tagged_exercise(difficulty: 2, tags: [tags[i]]) }
+        # Create real submissions for the user to reflect accessed exercises (no stubbing)
+        seen_exercises.each do |ex|
+          Submission.create!(exercise: ex, contributor: user, cause: 'submit')
+        end
+
+        def build_pool(size:, tags:)
+          pe = create(:proxy_exercise, algorithm: 'best_match')
+          # Distribute tags and difficulties deterministically
+          exercises = (1..size).map do |i|
+            difficulty = 1 + (i % 3) # 1..3
+            assigned_tags = [tags[i % tags.size], tags[(i + 1) % tags.size]].uniq
+            build_tagged_exercise(difficulty:, tags: assigned_tags)
+          end
+          pe.exercises << exercises
+          pe
+        end
+
+        # Measure across multiple pool sizes
+        sizes = [10, 30, 60]
+        # sizes = [1]
+        results = {}
+        sizes.each do |n|
+          pe = build_pool(size: n, tags:)
+          # Warm up: ensure associations are loaded enough to avoid counting lazy first-time loads unrelated to algorithm
+          pe.reload
+          queries = query_count { pe.get_matching_exercise(user) }
+          results[n] = queries
+        end
+
+        puts "BEST_MATCH query counts by pool size: #{results.inspect}"
+
+        # Very loose sanity checks — this spec primarily reports the current optimization level.
+        expect(results.values.all? {|q| q.positive? }).to be(true)
+      end
+    end
+
+    describe "#get_matching_exercise 'best_match' with varying tag counts" do
+      it 'reports current SQL query counts across different tags per exercise (opt-in via PERF=1)' do
+        # skip('Set PERF=1 to run performance specs') unless ENV['PERF']
+
+        # Helper to build an exercise with given tags and difficulty
+        def build_tagged_exercise(difficulty:, tags:)
+          ex = create(:dummy, expected_difficulty: difficulty)
+          tags.each do |tag|
+            ExerciseTag.create!(exercise: ex, tag: tag, factor: 1)
+          end
+          ex
+        end
+
+        user = create(:learner)
+
+        # Build a small history of exercises the user has accessed, covering some tags
+        base_tags = (1..8).map {|i| Tag.create!(name: "var-tags-base-#{i}-#{SecureRandom.hex(2)}") }
+        seen_exercises = base_tags.first(4).map {|t| build_tagged_exercise(difficulty: 2, tags: [t]) }
+        # Create real submissions for the user to reflect accessed exercises (no stubbing)
+        seen_exercises.each do |ex|
+          Submission.create!(exercise: ex, contributor: user, cause: 'submit')
+        end
+
+        def build_pool_with_tag_count(size:, tag_universe:, tags_per_ex:)
+          pe = create(:proxy_exercise, algorithm: 'best_match')
+          exercises = (1..size).map do |i|
+            difficulty = 1 + (i % 3) # 1..3
+            assigned = (0...tags_per_ex).map {|k| tag_universe[(i + k) % tag_universe.size] }.uniq
+            build_tagged_exercise(difficulty: difficulty, tags: assigned)
+          end
+          pe.exercises << exercises
+          pe
+        end
+
+        # Vary the number of tags per exercise while keeping pool size constant
+        tags_per_exercise = [1, 2, 3, 5]
+        tag_universe = (1..10).map {|i| Tag.create!(name: "var-tags-univ-#{i}-#{SecureRandom.hex(2)}") }
+        results = {}
+        tags_per_exercise.each do |k|
+          pe = build_pool_with_tag_count(size: 30, tag_universe: tag_universe, tags_per_ex: k)
+          # Warm up: ensure associations are loaded enough to avoid counting lazy first-time loads unrelated to algorithm
+          pe.reload
+          queries = query_count { pe.get_matching_exercise(user) }
+          results[k] = queries
+        end
+
+        puts "BEST_MATCH query counts by tags per exercise: #{results.inspect}"
+
+        # Loose sanity check
+        expect(results.values.all? {|q| q.positive? }).to be(true)
+      end
+    end
+  end
+end
diff --git a/spec/models/proxy_exercise_spec.rb b/spec/models/proxy_exercise_spec.rb
@@ -0,0 +1,150 @@
+# frozen_string_literal: true
+
+require 'rails_helper'
+
+RSpec.describe ProxyExercise do
+  describe 'defaults and validations' do
+    let(:pe) { described_class.new }
+
+    it 'sets public to false by default (after_initialize)' do
+      expect(pe.public).to be(false)
+    end
+
+    it 'generates a token if none is present (after_initialize)' do
+      expect(pe.token).to be_present
+      expect(pe.token).to be_a(String)
+      expect(pe.token.size).to eq(8) # SecureRandom.hex(4)
+    end
+
+    context 'with a token' do
+      let(:pe) { build(:proxy_exercise, token: 'fixedtoken') }
+
+      it 'does not overwrite an existing token' do
+        expect(pe.token).to eq('fixedtoken')
+      end
+    end
+  end
+
+  describe '#count_files' do
+    let(:pe) { create(:proxy_exercise, exercises: create_list(:dummy, 7)) }
+
+    it 'returns the number of associated exercises' do
+      expect(pe.count_files).to eq(7)
+    end
+  end
+
+  describe '#duplicate' do
+    subject(:copy) { original.duplicate(title: 'Copy') }
+
+    let(:original) { create(:proxy_exercise, title: 'Original', token: 'tok12345') }
+
+    it 'returns a duplicated instance with overridden attributes' do
+      expect(copy).to be_a(ProxyExercise)
+      expect(copy).not_to be_persisted
+      expect(copy.title).to eq('Copy')
+      expect(copy.token).to eq('tok12345')
+    end
+  end
+
+  describe '#get_matching_exercise' do
+    let(:user) { create(:learner) }
+
+    context 'when an assignment already exists for the user' do
+      subject(:get_matching_exercise) { pe.get_matching_exercise(user) }
+
+      let(:pe) { create(:proxy_exercise) }
+      let(:ex) { create(:dummy) }
+
+      before { UserProxyExerciseExercise.create!(user:, exercise: ex, proxy_exercise: pe) }
+
+      it { is_expected.to eql ex }
+
+      it 'returns the previously assigned exercise without creating another one' do
+        expect { get_matching_exercise }.not_to change(UserProxyExerciseExercise, :count)
+      end
+    end
+
+    context "when algorithm is 'random'" do
+      subject(:get_matching_exercise) { pe.get_matching_exercise(user) }
+
+      let(:pe) { create(:proxy_exercise, algorithm: 'random', exercises: [chosen, other]) }
+      let(:chosen) { create(:dummy) }
+      let(:other) { create(:dummy) }
+
+      before { allow(pe.exercises.target).to receive(:sample).and_return(chosen) }
+
+      it { is_expected.to eql chosen }
+
+      it 'uses exercises.sample and persists the assignment with a reason' do
+        expect { get_matching_exercise }.to change(UserProxyExerciseExercise, :count).by(1)
+
+        assignment = UserProxyExerciseExercise.last
+        reason = JSON.parse(assignment.reason)
+        expect(reason['reason']).to eq('random exercise requested')
+        expect(assignment).to have_attributes(user: user, exercise: chosen, proxy_exercise: pe)
+      end
+    end
+
+    context "when algorithm is 'best_match'" do
+      subject(:get_matching_exercise) { pe.get_matching_exercise(user) }
+
+      let(:pe) { create(:proxy_exercise, algorithm: 'best_match', exercises: [easy, hard]) }
+      let(:easy) { create(:dummy, expected_difficulty: 1) }
+      let(:hard) { create(:dummy, expected_difficulty: 2) }
+
+      context 'when an error occurs' do
+        before do
+          allow(pe).to receive(:find_matching_exercise).and_raise(StandardError, 'boom')
+          allow(pe.exercises.target).to receive(:sample).and_return(hard)
+        end
+
+        it { is_expected.to eql hard }
+
+        it 'falls back to a random exercise with expected_difficulty > 1 and records the error in the reason' do
+          expect { get_matching_exercise }.to change(UserProxyExerciseExercise, :count).by(1)
+
+          reason = JSON.parse(UserProxyExerciseExercise.last.reason)
+          expect(reason['reason']).to eq('fallback because of error')
+          expect(reason['error']).to include('boom')
+        end
+      end
+
+      context 'when the user has not seen any tags' do
+        let(:easy) { create(:dummy, expected_difficulty: 1, tags: [build(:tag, name: 'arrays')]) }
+        let(:hard) { create(:dummy, expected_difficulty: 2, tags: [build(:tag, name: 'hashes')]) }
+
+        it { is_expected.to eql easy }
+
+        it 'selects the easiest exercise and records the reason' do
+          expect { get_matching_exercise }.to change(UserProxyExerciseExercise, :count).by(1)
+
+          reason = JSON.parse(UserProxyExerciseExercise.last.reason)
+          expect(reason['reason']).to eq('easiest exercise in pool. empty potential exercises')
+        end
+      end
+
+      context 'when user has submission for exercise' do
+        let(:tag) { build(:tag, name: 'loops') }
+        let(:hard) { create(:dummy, expected_difficulty: 2, tags: [tag]) }
+
+        before do
+          create(:dummy, expected_difficulty: 1, tags: [tag])
+          create(:dummy, expected_difficulty: 3, tags: [tag])
+          create(:submission, cause: 'submit', exercise: build(:dummy, expected_difficulty: 2, tags: [tag]), contributor: user)
+        end
+
+        it { is_expected.to eql hard }
+
+        it 'chooses the best matching exercise within the difficulty constraint and stores a detailed reason' do
+          expect { get_matching_exercise }.to change(UserProxyExerciseExercise, :count).by(1)
+
+          reason = JSON.parse(UserProxyExerciseExercise.last.reason)
+          expect(reason['reason']).to eq('best matching exercise')
+          expect(reason['highest_difficulty_user_has_accessed']).to eq(2)
+          expect(reason).to have_key('current_users_knowledge_lack')
+          expect(reason).to have_key('relative_knowledge_improvement')
+        end
+      end
+    end
+  end
+end