Skip to content

Commit 6b48ab6

Browse files
authored
Merge pull request #3397 from xiaoha-cloud/topic-modeling-schema-only
feat: Add Topic Modeling database schema tables
2 parents 5248b07 + aa67f9b commit 6b48ab6

File tree

4 files changed

+228
-0
lines changed

4 files changed

+228
-0
lines changed

augur/application/db/models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
RepoSbomScan,
4040
RepoStat,
4141
RepoTopic,
42+
TopicModelMeta,
4243
CommitCommentRef,
4344
CommitParent,
4445
DiscourseInsight,

augur/application/db/models/augur_data.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3601,3 +3601,107 @@ class RepoClone(Base):
36013601
clone_data_timestamp = Column(TIMESTAMP(precision=6))
36023602

36033603
repo = relationship("Repo")
3604+
3605+
3606+
class TopicModelMeta(Base):
    """Metadata for one trained topic model (augur_data.topic_model_meta).

    Each row records the configuration used to train a topic model, the
    locations of its persisted artifacts, and the quality metrics computed
    for it.  ``parameters_hash`` supports deduplication of models trained
    with identical parameters.  Created by Alembic revision '35'.
    """

    __tablename__ = "topic_model_meta"
    __table_args__ = {"schema": "augur_data"}

    # Generated server-side; gen_random_uuid() requires PostgreSQL 13+
    # or the pgcrypto extension — TODO confirm deployment target.
    model_id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        server_default=text("gen_random_uuid()"),
        comment="Unique identifier for the topic model"
    )
    # Nullable (Column default): the companion migration explicitly keeps
    # repo_id NULL-able for models trained across multiple repositories.
    repo_id = Column(
        ForeignKey("augur_data.repo.repo_id"),
        comment="Repository this model was trained on"
    )
    model_method = Column(
        String,
        nullable=False,
        comment="Method used for topic modeling (e.g., 'NMF_COUNT', 'LDA_TFIDF')"
    )
    num_topics = Column(
        Integer,
        nullable=False,
        comment="Number of topics in the model"
    )
    num_words_per_topic = Column(
        Integer,
        nullable=False,
        comment="Number of words per topic"
    )
    # NOTE(review): declared as generic JSON here, but the migration
    # creates these columns as JSONB — confirm the intended type.
    training_parameters = Column(
        JSON,
        nullable=False,
        comment="JSON object containing training parameters"
    )
    model_file_paths = Column(
        JSON,
        nullable=False,
        comment="JSON object containing paths to model artifacts"
    )
    parameters_hash = Column(
        String,
        nullable=False,
        comment="Hash of parameters for deduplication"
    )
    # Metric columns default to 0.0 so rows can be inserted before
    # evaluation completes.
    coherence_score = Column(
        Float,
        nullable=False,
        server_default=text("0.0"),
        comment="Coherence score of the model"
    )
    perplexity_score = Column(
        Float,
        nullable=False,
        server_default=text("0.0"),
        comment="Perplexity score of the model"
    )
    topic_diversity = Column(
        Float,
        nullable=False,
        server_default=text("0.0"),
        comment="Topic diversity score"
    )
    # Server default is a jsonb cast, matching the JSONB column the
    # migration actually creates for this field.
    quality = Column(
        JSON,
        nullable=False,
        server_default=text("'{}'::jsonb"),
        comment="Quality metrics"
    )
    training_message_count = Column(
        BigInteger,
        nullable=False,
        comment="Number of messages used for training"
    )
    data_fingerprint = Column(
        JSON,
        nullable=False,
        comment="Fingerprint of training data"
    )
    # Optional: only populated when visualization artifacts are produced.
    visualization_data = Column(
        JSON,
        nullable=True,
        comment="JSON object containing visualization data for the model"
    )
    training_start_time = Column(
        TIMESTAMP(timezone=True),
        nullable=False,
        comment="When training started"
    )
    training_end_time = Column(
        TIMESTAMP(timezone=True),
        nullable=False,
        comment="When training ended"
    )
    # NOTE(review): nullable here (Column default) but created NOT NULL by
    # the migration — confirm which side is authoritative.
    tool_source = Column(String, comment="Standard Augur Metadata")
    tool_version = Column(String, comment="Standard Augur Metadata")
    data_source = Column(String, comment="Standard Augur Metadata")
    # precision=0 (postgres dialect TIMESTAMP) truncates fractional
    # seconds; the migration's column omits the precision — verify intent.
    data_collection_date = Column(
        TIMESTAMP(timezone=True, precision=0),
        server_default=text("CURRENT_TIMESTAMP")
    )

    repo = relationship("Repo")
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""Create topic_model_meta table
2+
3+
Revision ID: 35
4+
Revises: 34
5+
Create Date: 2024-08-28 20:30:00.000000
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
from sqlalchemy.dialects import postgresql
11+
12+
# revision identifiers, used by Alembic.
13+
revision = '35'
14+
down_revision = '34'
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
    """Create the augur_data.topic_model_meta table.

    Column layout follows the ER diagram: every field is NOT NULL except
    repo_id (left nullable for multi-repo training) and the optional
    visualization_data payload.
    """
    table_columns = [
        # Primary key, generated server-side.
        sa.Column('model_id', sa.UUID(), server_default=sa.text('gen_random_uuid()'), nullable=False),
        # Nullable so a single model may span multiple repositories.
        sa.Column('repo_id', sa.Integer(), nullable=True),
        # Core model metadata.
        sa.Column('model_method', sa.String(), nullable=False),
        sa.Column('num_topics', sa.Integer(), nullable=False),
        sa.Column('num_words_per_topic', sa.Integer(), nullable=False),
        # Training configuration and artifact locations.
        sa.Column('training_parameters', postgresql.JSONB(), nullable=False),
        sa.Column('model_file_paths', postgresql.JSONB(), nullable=False),
        sa.Column('parameters_hash', sa.String(), nullable=False),
        # Quality metrics — NOT NULL but defaulted so inserts may omit them.
        sa.Column('coherence_score', sa.Float(), nullable=False, server_default=sa.text('0.0')),
        sa.Column('perplexity_score', sa.Float(), nullable=False, server_default=sa.text('0.0')),
        sa.Column('topic_diversity', sa.Float(), nullable=False, server_default=sa.text('0.0')),
        sa.Column('quality', postgresql.JSONB(), nullable=False, server_default=sa.text("'{}'::jsonb")),
        # Training data provenance.
        sa.Column('training_message_count', sa.BigInteger(), nullable=False),
        sa.Column('data_fingerprint', postgresql.JSONB(), nullable=False),
        # Optional pre-computed visualization payload.
        sa.Column('visualization_data', postgresql.JSONB(), nullable=True),
        # Training window and collection timestamp.
        sa.Column('training_start_time', sa.TIMESTAMP(timezone=True), nullable=False),
        sa.Column('training_end_time', sa.TIMESTAMP(timezone=True), nullable=False),
        sa.Column('data_collection_date', sa.TIMESTAMP(timezone=True), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
        # Standard Augur metadata columns.
        sa.Column('tool_source', sa.String(), nullable=False),
        sa.Column('tool_version', sa.String(), nullable=False),
        sa.Column('data_source', sa.String(), nullable=False),
    ]

    op.create_table(
        'topic_model_meta',
        *table_columns,
        sa.ForeignKeyConstraint(['repo_id'], ['augur_data.repo.repo_id']),
        sa.PrimaryKeyConstraint('model_id'),
        schema='augur_data',
    )
66+
67+
68+
def downgrade():
    """Reverse upgrade() by dropping the topic_model_meta table."""
    op.drop_table('topic_model_meta', schema='augur_data')
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""
2+
Create topic_model_event table for DB event logging
3+
4+
Revision ID: 36
5+
Revises: 35
6+
Create Date: 2025-08-21
7+
"""
8+
import sqlalchemy as sa
9+
from alembic import op
10+
from sqlalchemy.dialects import postgresql
11+
12+
# revision identifiers, used by Alembic.
13+
revision = "36"
14+
down_revision = "35"
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
    """Create augur_data.topic_model_event plus its two query indexes."""
    schema = "augur_data"

    op.create_table(
        "topic_model_event",
        # Surrogate key for each logged event.
        sa.Column("event_id", sa.BigInteger(), primary_key=True),
        # Event timestamp, defaulted by the database.
        sa.Column(
            "ts",
            sa.TIMESTAMP(timezone=True),
            server_default=sa.text("CURRENT_TIMESTAMP"),
            nullable=False,
        ),
        # Both references are optional so an event row can outlive (or
        # predate) the rows it points at.
        sa.Column("repo_id", sa.Integer(), nullable=True),
        sa.Column("model_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("event", sa.Text(), nullable=False),
        sa.Column("level", sa.Text(), server_default=sa.text("'INFO'"), nullable=False),
        sa.Column("payload", postgresql.JSONB(astext_type=sa.Text()), nullable=False),
        sa.ForeignKeyConstraint(
            ["repo_id"], ["augur_data.repo.repo_id"], name="fk_tme_repo_id"
        ),
        # Deleting a model keeps its event log but clears the reference.
        sa.ForeignKeyConstraint(
            ["model_id"],
            ["augur_data.topic_model_meta.model_id"],
            name="fk_tme_model_id",
            ondelete="SET NULL",
        ),
        schema=schema,
    )

    # Index the common query paths: per-repo timelines and event-name lookups.
    for index_name, indexed_columns in (
        ("ix_tme_repo_ts", ["repo_id", "ts"]),
        ("ix_tme_event", ["event"]),
    ):
        op.create_index(index_name, "topic_model_event", indexed_columns, schema=schema)
49+
50+
51+
def downgrade():
    """Reverse upgrade(): drop both indexes, then the table itself."""
    schema = "augur_data"
    for index_name in ("ix_tme_event", "ix_tme_repo_ts"):
        op.drop_index(index_name, table_name="topic_model_event", schema=schema)
    op.drop_table("topic_model_event", schema=schema)

0 commit comments

Comments (0)