Skip to content

Commit 6b48ab6

Browse files
authored
Merge pull request #3397 from xiaoha-cloud/topic-modeling-schema-only
feat: Add Topic Modeling database schema tables
2 parents 5248b07 + aa67f9b commit 6b48ab6

File tree

4 files changed

+228
-0
lines changed

4 files changed

+228
-0
lines changed

augur/application/db/models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
RepoSbomScan,
4040
RepoStat,
4141
RepoTopic,
42+
TopicModelMeta,
4243
CommitCommentRef,
4344
CommitParent,
4445
DiscourseInsight,

augur/application/db/models/augur_data.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3601,3 +3601,107 @@ class RepoClone(Base):
36013601
clone_data_timestamp = Column(TIMESTAMP(precision=6))
36023602

36033603
repo = relationship("Repo")
3604+
3605+
3606+
class TopicModelMeta(Base):
    """Metadata for one trained topic model (augur_data.topic_model_meta).

    Each row records the configuration used to train a topic model, the
    locations of its persisted artifacts, and the quality metrics computed
    for it.  ``parameters_hash`` supports deduplication of models trained
    with identical parameters.  Created by Alembic revision '35'.
    """

    __tablename__ = "topic_model_meta"
    __table_args__ = {"schema": "augur_data"}

    # Generated server-side; gen_random_uuid() requires PostgreSQL 13+
    # or the pgcrypto extension — TODO confirm deployment target.
    model_id = Column(
        UUID(as_uuid=True),
        primary_key=True,
        server_default=text("gen_random_uuid()"),
        comment="Unique identifier for the topic model"
    )
    # Nullable (Column default): the companion migration explicitly keeps
    # repo_id NULL-able for models trained across multiple repositories.
    repo_id = Column(
        ForeignKey("augur_data.repo.repo_id"),
        comment="Repository this model was trained on"
    )
    model_method = Column(
        String,
        nullable=False,
        comment="Method used for topic modeling (e.g., 'NMF_COUNT', 'LDA_TFIDF')"
    )
    num_topics = Column(
        Integer,
        nullable=False,
        comment="Number of topics in the model"
    )
    num_words_per_topic = Column(
        Integer,
        nullable=False,
        comment="Number of words per topic"
    )
    # NOTE(review): declared as generic JSON here, but the migration
    # creates these columns as JSONB — confirm the intended type.
    training_parameters = Column(
        JSON,
        nullable=False,
        comment="JSON object containing training parameters"
    )
    model_file_paths = Column(
        JSON,
        nullable=False,
        comment="JSON object containing paths to model artifacts"
    )
    parameters_hash = Column(
        String,
        nullable=False,
        comment="Hash of parameters for deduplication"
    )
    # Metric columns default to 0.0 so rows can be inserted before
    # evaluation completes.
    coherence_score = Column(
        Float,
        nullable=False,
        server_default=text("0.0"),
        comment="Coherence score of the model"
    )
    perplexity_score = Column(
        Float,
        nullable=False,
        server_default=text("0.0"),
        comment="Perplexity score of the model"
    )
    topic_diversity = Column(
        Float,
        nullable=False,
        server_default=text("0.0"),
        comment="Topic diversity score"
    )
    # Server default is a jsonb cast, matching the JSONB column the
    # migration actually creates for this field.
    quality = Column(
        JSON,
        nullable=False,
        server_default=text("'{}'::jsonb"),
        comment="Quality metrics"
    )
    training_message_count = Column(
        BigInteger,
        nullable=False,
        comment="Number of messages used for training"
    )
    data_fingerprint = Column(
        JSON,
        nullable=False,
        comment="Fingerprint of training data"
    )
    # Optional: only populated when visualization artifacts are produced.
    visualization_data = Column(
        JSON,
        nullable=True,
        comment="JSON object containing visualization data for the model"
    )
    training_start_time = Column(
        TIMESTAMP(timezone=True),
        nullable=False,
        comment="When training started"
    )
    training_end_time = Column(
        TIMESTAMP(timezone=True),
        nullable=False,
        comment="When training ended"
    )
    # NOTE(review): nullable here (Column default) but created NOT NULL by
    # the migration — confirm which side is authoritative.
    tool_source = Column(String, comment="Standard Augur Metadata")
    tool_version = Column(String, comment="Standard Augur Metadata")
    data_source = Column(String, comment="Standard Augur Metadata")
    # precision=0 (postgres dialect TIMESTAMP) truncates fractional
    # seconds; the migration's column omits the precision — verify intent.
    data_collection_date = Column(
        TIMESTAMP(timezone=True, precision=0),
        server_default=text("CURRENT_TIMESTAMP")
    )

    repo = relationship("Repo")
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""Create topic_model_meta table
2+
3+
Revision ID: 35
4+
Revises: 34
5+
Create Date: 2024-08-28 20:30:00.000000
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
from sqlalchemy.dialects import postgresql
11+
12+
# revision identifiers, used by Alembic.
13+
revision = '35'
14+
down_revision = '34'
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
    """Create the augur_data.topic_model_meta table.

    Column layout follows the ER diagram: every field is NOT NULL except
    repo_id (left nullable for multi-repo training) and the optional
    visualization_data payload.
    """
    table_columns = [
        # Primary key, generated server-side.
        sa.Column('model_id', sa.UUID(), server_default=sa.text('gen_random_uuid()'), nullable=False),
        # Nullable so a single model may span multiple repositories.
        sa.Column('repo_id', sa.Integer(), nullable=True),
        # Core model metadata.
        sa.Column('model_method', sa.String(), nullable=False),
        sa.Column('num_topics', sa.Integer(), nullable=False),
        sa.Column('num_words_per_topic', sa.Integer(), nullable=False),
        # Training configuration and artifact locations.
        sa.Column('training_parameters', postgresql.JSONB(), nullable=False),
        sa.Column('model_file_paths', postgresql.JSONB(), nullable=False),
        sa.Column('parameters_hash', sa.String(), nullable=False),
        # Quality metrics — NOT NULL but defaulted so inserts may omit them.
        sa.Column('coherence_score', sa.Float(), nullable=False, server_default=sa.text('0.0')),
        sa.Column('perplexity_score', sa.Float(), nullable=False, server_default=sa.text('0.0')),
        sa.Column('topic_diversity', sa.Float(), nullable=False, server_default=sa.text('0.0')),
        sa.Column('quality', postgresql.JSONB(), nullable=False, server_default=sa.text("'{}'::jsonb")),
        # Training data provenance.
        sa.Column('training_message_count', sa.BigInteger(), nullable=False),
        sa.Column('data_fingerprint', postgresql.JSONB(), nullable=False),
        # Optional pre-computed visualization payload.
        sa.Column('visualization_data', postgresql.JSONB(), nullable=True),
        # Training window and collection timestamp.
        sa.Column('training_start_time', sa.TIMESTAMP(timezone=True), nullable=False),
        sa.Column('training_end_time', sa.TIMESTAMP(timezone=True), nullable=False),
        sa.Column('data_collection_date', sa.TIMESTAMP(timezone=True), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
        # Standard Augur metadata columns.
        sa.Column('tool_source', sa.String(), nullable=False),
        sa.Column('tool_version', sa.String(), nullable=False),
        sa.Column('data_source', sa.String(), nullable=False),
    ]

    op.create_table(
        'topic_model_meta',
        *table_columns,
        sa.ForeignKeyConstraint(['repo_id'], ['augur_data.repo.repo_id']),
        sa.PrimaryKeyConstraint('model_id'),
        schema='augur_data',
    )
66+
67+
68+
def downgrade():
    """Reverse upgrade() by dropping the topic_model_meta table."""
    op.drop_table('topic_model_meta', schema='augur_data')
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""
2+
Create topic_model_event table for DB event logging
3+
4+
Revision ID: 36
5+
Revises: 35
6+
Create Date: 2025-08-21
7+
"""
8+
import sqlalchemy as sa
9+
from alembic import op
10+
from sqlalchemy.dialects import postgresql
11+
12+
# revision identifiers, used by Alembic.
13+
revision = "36"
14+
down_revision = "35"
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
    """Create augur_data.topic_model_event plus its two query indexes."""
    schema = "augur_data"

    op.create_table(
        "topic_model_event",
        # Surrogate key for each logged event.
        sa.Column("event_id", sa.BigInteger(), primary_key=True),
        # Event timestamp, defaulted by the database.
        sa.Column(
            "ts",
            sa.TIMESTAMP(timezone=True),
            server_default=sa.text("CURRENT_TIMESTAMP"),
            nullable=False,
        ),
        # Both references are optional so an event row can outlive (or
        # predate) the rows it points at.
        sa.Column("repo_id", sa.Integer(), nullable=True),
        sa.Column("model_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("event", sa.Text(), nullable=False),
        sa.Column("level", sa.Text(), server_default=sa.text("'INFO'"), nullable=False),
        sa.Column("payload", postgresql.JSONB(astext_type=sa.Text()), nullable=False),
        sa.ForeignKeyConstraint(
            ["repo_id"], ["augur_data.repo.repo_id"], name="fk_tme_repo_id"
        ),
        # Deleting a model keeps its event log but clears the reference.
        sa.ForeignKeyConstraint(
            ["model_id"],
            ["augur_data.topic_model_meta.model_id"],
            name="fk_tme_model_id",
            ondelete="SET NULL",
        ),
        schema=schema,
    )

    # Index the common query paths: per-repo timelines and event-name lookups.
    for index_name, indexed_columns in (
        ("ix_tme_repo_ts", ["repo_id", "ts"]),
        ("ix_tme_event", ["event"]),
    ):
        op.create_index(index_name, "topic_model_event", indexed_columns, schema=schema)
49+
50+
51+
def downgrade():
    """Reverse upgrade(): drop both indexes, then the table itself."""
    schema = "augur_data"
    for index_name in ("ix_tme_event", "ix_tme_repo_ts"):
        op.drop_index(index_name, table_name="topic_model_event", schema=schema)
    op.drop_table("topic_model_event", schema=schema)

0 commit comments

Comments (0)