-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathbigslice_schema.txt
More file actions
239 lines (239 loc) · 8.64 KB
/
bigslice_schema.txt
File metadata and controls
239 lines (239 loc) · 8.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
CREATE TABLE bgc (
id INTEGER PRIMARY KEY AUTOINCREMENT,
dataset_id INTEGER NOT NULL,
name VARCHAR(250) NOT NULL,
type VARCHAR(10) NOT NULL,
on_contig_edge BOOLEAN,
length_nt INTEGER NOT NULL,
orig_folder VARCHAR(1500) NOT NULL,
orig_filename VARCHAR(1500) NOT NULL,
UNIQUE(orig_folder, orig_filename, dataset_id),
FOREIGN KEY(dataset_id) REFERENCES dataset(id),
FOREIGN KEY(type) REFERENCES enum_bgc_type(code)
);
CREATE TABLE sqlite_sequence(name,seq);
CREATE TABLE bgc_class (
bgc_id INTEGER NOT NULL,
chem_subclass_id INTEGER NOT NULL,
FOREIGN KEY(bgc_id) REFERENCES bgc(id),
FOREIGN KEY(chem_subclass_id) REFERENCES chem_subclass(id)
);
CREATE TABLE bgc_features (
bgc_id INTEGER NOT NULL,
hmm_id INTEGER NOT NULL,
value INTEGER NOT NULL,
UNIQUE(bgc_id, hmm_id),
FOREIGN KEY(bgc_id) REFERENCES bgc(id),
FOREIGN KEY(hmm_id) REFERENCES hmm(id)
);
CREATE TABLE bgc_taxonomy (
bgc_id INTEGER NOT NULL,
taxon_id INTEGER NOT NULL,
FOREIGN KEY(bgc_id) REFERENCES bgc(id),
FOREIGN KEY(taxon_id) REFERENCES taxon(id)
);
CREATE TABLE cds (
id INTEGER PRIMARY KEY AUTOINCREMENT,
bgc_id INTEGER NOT NULL,
nt_start INTEGER NOT NULL,
nt_end INTEGER NOT NULL,
strand INTEGER CHECK(strand IN (-1,0,1)),
locus_tag VARCHAR(100),
protein_id VARCHAR(100),
product VARCHAR(100),
aa_seq TEXT NOT NULL,
FOREIGN KEY(bgc_id) REFERENCES bgc(id)
);
CREATE TABLE chem_class (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(100) NOT NULL UNIQUE
);
CREATE TABLE chem_subclass (
id INTEGER PRIMARY KEY AUTOINCREMENT,
class_id INTEGER NOT NULL,
name VARCHAR(100) NOT NULL,
FOREIGN KEY(class_id) REFERENCES chem_class(id)
);
CREATE TABLE chem_subclass_map (
class_source VARCHAR(100) NOT NULL,
type_source VARCHAR(10) NOT NULL,
subclass_id INTEGER NOT NULL,
FOREIGN KEY(type_source) REFERENCES enum_bgc_type(code),
FOREIGN KEY(subclass_id) REFERENCES chem_subclass(id)
);
CREATE TABLE clustering (
id INTEGER PRIMARY KEY AUTOINCREMENT,
run_id INTEGER NOT NULL UNIQUE,
clustering_method VARCHAR(100) NOT NULL,
num_centroids INTEGER NOT NULL,
threshold REAL NOT NULL,
random_seed INTEGER NOT NULL,
FOREIGN KEY(run_id) REFERENCES run(id)
);
CREATE TABLE dataset (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(250) NOT NULL UNIQUE,
orig_folder VARCHAR(250) NOT NULL,
description VARCHAR(2500) NOT NULL
);
CREATE TABLE enum_bgc_type (
code VARCHAR(10) PRIMARY KEY,
description VARCHAR(250)
);
CREATE TABLE enum_run_status (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(100) NOT NULL UNIQUE
);
CREATE TABLE gcf (
id INTEGER PRIMARY KEY AUTOINCREMENT,
id_in_run INTEGER NOT NULL,
clustering_id INTEGER NOT NULL,
UNIQUE(id_in_run, clustering_id),
FOREIGN KEY(clustering_id) REFERENCES clustering(id)
);
CREATE TABLE gcf_membership (
gcf_id INTEGER NOT NULL,
bgc_id INTEGER NOT NULL,
membership_value REAL NOT NULL,
rank INTEGER NOT NULL,
FOREIGN KEY(gcf_id) REFERENCES gcf(id),
FOREIGN KEY(bgc_id) REFERENCES bgc(id)
);
CREATE TABLE gcf_models (
gcf_id INTEGER NOT NULL,
hmm_id INTEGER NOT NULL,
value REAL NOT NULL,
UNIQUE(gcf_id, hmm_id),
FOREIGN KEY(gcf_id) REFERENCES gcf(id),
FOREIGN KEY(hmm_id) REFERENCES hmm(id)
);
CREATE TABLE hmm (
id INTEGER PRIMARY KEY AUTOINCREMENT,
accession VARCHAR(10),
name VARCHAR(25) NOT NULL,
db_id INTEGER NOT NULL,
model_length INTEGER NOT NULL,
FOREIGN KEY(db_id) REFERENCES hmm_db(id)
);
CREATE TABLE hmm_db (
id INTEGER PRIMARY KEY AUTOINCREMENT,
md5_biosyn_pfam CHAR(32) NOT NULL,
md5_sub_pfam CHAR(32) NOT NULL
);
CREATE TABLE hsp (
id INTEGER PRIMARY KEY AUTOINCREMENT,
cds_id INTEGER NOT NULL,
hmm_id INTEGER NOT NULL,
bitscore REAL NOT NULL,
FOREIGN KEY(cds_id) REFERENCES cds(id),
FOREIGN KEY(hmm_id) REFERENCES hmm(id)
);
CREATE TABLE hsp_alignment (
hsp_id INTEGER UNIQUE NOT NULL,
model_start INTEGER NOT NULL,
model_end INTEGER NOT NULL,
model_gaps TEXT NOT NULL,
cds_start INTEGER NOT NULL,
cds_end INTEGER NOT NULL,
cds_gaps TEXT NOT NULL,
FOREIGN KEY(hsp_id) REFERENCES hsp(id)
);
CREATE TABLE hsp_subpfam (
hsp_subpfam_id INTEGER NOT NULL,
hsp_parent_id INTEGER NOT NULL,
UNIQUE(hsp_subpfam_id, hsp_parent_id),
FOREIGN KEY(hsp_subpfam_id) REFERENCES hsp(id),
FOREIGN KEY(hsp_parent_id) REFERENCES hsp(id)
);
CREATE TABLE run (
id INTEGER PRIMARY KEY AUTOINCREMENT,
status INTEGER NOT NULL,
prog_params VARCHAR(250) NOT NULL,
hmm_db_id INTEGER,
FOREIGN KEY(status) REFERENCES enum_run_status(id),
FOREIGN KEY(hmm_db_id) REFERENCES hmm_db(id)
);
CREATE TABLE run_bgc_status (
bgc_id INTEGER NOT NULL,
run_id INTEGER NOT NULL,
status INTEGER NOT NULL,
FOREIGN KEY(bgc_id) REFERENCES bgc(id),
FOREIGN KEY(run_id) REFERENCES run(id),
FOREIGN KEY(status) REFERENCES enum_run_status(id)
);
CREATE TABLE run_log (
run_id INTEGER NOT NULL,
time_stamp DATETIME NOT NULL,
message VARCHAR(500) NOT NULL
);
CREATE TABLE schema (
ver VARCHAR(10) PRIMARY KEY
);
CREATE TABLE subpfam (
hmm_id INTEGER NOT NULL,
parent_hmm_id INTEGER NOT NULL,
FOREIGN KEY(hmm_id) REFERENCES hmm(id),
FOREIGN KEY(parent_hmm_id) REFERENCES hmm(id)
);
CREATE TABLE taxon (
id INTEGER PRIMARY KEY AUTOINCREMENT,
level INTEGER NOT NULL,
name VARCHAR(100) NOT NULL,
UNIQUE(name, level),
FOREIGN KEY(level) REFERENCES taxon_class(level)
);
CREATE TABLE taxon_class (
id INTEGER PRIMARY KEY AUTOINCREMENT,
level INTEGER NOT NULL UNIQUE,
name VARCHAR(100) NOT NULL UNIQUE
);
CREATE INDEX dataset_name ON dataset(name);
CREATE INDEX bgc_dataset ON bgc(dataset_id);
CREATE INDEX bgc_name ON bgc(name);
CREATE INDEX bgc_type ON bgc(type);
CREATE INDEX bgc_gbkpath ON bgc(orig_folder, orig_filename);
CREATE INDEX bgc_filename ON bgc(orig_filename);
CREATE INDEX bgc_contigedge ON bgc(on_contig_edge);
CREATE INDEX bgc_length ON bgc(length_nt);
CREATE INDEX cds_bgc ON cds(bgc_id,nt_start,nt_end);
CREATE INDEX hmm_acc ON hmm(db_id, accession);
CREATE INDEX hmm_name ON hmm(db_id, name);
CREATE INDEX subpfam_parenthmm ON subpfam(parent_hmm_id, hmm_id);
CREATE INDEX hsp_cdshmm ON hsp(cds_id, hmm_id);
CREATE INDEX hsp_bitscore ON hsp(bitscore);
CREATE INDEX hspalign_id ON hsp_alignment(hsp_id);
CREATE INDEX hspalign_model ON hsp_alignment(model_start);
CREATE INDEX hspalign_cds ON hsp_alignment(cds_start);
CREATE INDEX hspsubpfam_parent ON hsp_subpfam(hsp_parent_id, hsp_subpfam_id);
CREATE INDEX hspsubpfam_sub ON hsp_subpfam(hsp_subpfam_id, hsp_parent_id);
CREATE UNIQUE INDEX taxon_class_level ON taxon_class(level);
CREATE UNIQUE INDEX taxon_class_name ON taxon_class(name);
CREATE INDEX taxon_level ON taxon(level, name);
CREATE INDEX taxon_name ON taxon(name, level);
CREATE INDEX bgctaxonomy_bgcid ON bgc_taxonomy(bgc_id);
CREATE INDEX bgctaxonomy_taxid ON bgc_taxonomy(taxon_id);
CREATE UNIQUE INDEX chemclass_name ON chem_class(name);
CREATE INDEX chemsubclass_name ON chem_subclass(name, class_id);
CREATE INDEX chemsubclass_class ON chem_subclass(class_id, name);
CREATE INDEX chemsubclassmap_source ON chem_subclass_map(type_source, class_source);
CREATE INDEX bgcclass_chemsubclass ON bgc_class(chem_subclass_id, bgc_id);
CREATE INDEX bgcclass_bgc ON bgc_class(bgc_id, chem_subclass_id);
CREATE UNIQUE INDEX enumrunstatus_name ON enum_run_status(name);
CREATE INDEX run_hmmdb ON run(hmm_db_id, status);
CREATE INDEX runlog_run ON run_log(run_id, time_stamp);
CREATE INDEX runbgcstatus_run_status ON run_bgc_status(run_id, status, bgc_id);
CREATE INDEX runbgcstatus_run_bgc ON run_bgc_status(run_id, bgc_id, status);
CREATE INDEX bgc_features_bgc ON bgc_features(bgc_id, hmm_id, value);
CREATE INDEX bgc_features_bgc_value ON bgc_features(value, bgc_id, hmm_id);
CREATE INDEX bgc_features_hmm ON bgc_features(hmm_id, bgc_id, value);
CREATE INDEX bgc_features_hmm_value ON bgc_features(value, hmm_id, bgc_id);
CREATE INDEX clustering_run ON clustering(run_id);
CREATE INDEX gcf_clustering ON gcf(clustering_id, id_in_run);
CREATE INDEX gcf_models_gcf ON gcf_models(gcf_id, hmm_id, value);
CREATE INDEX gcf_models_gcf_value ON gcf_models(value, gcf_id, hmm_id);
CREATE INDEX gcf_models_hmm ON gcf_models(hmm_id, gcf_id);
CREATE INDEX gcf_models_hmm_value ON gcf_models(value, hmm_id, gcf_id);
CREATE INDEX gcf_membership_gcf_rank ON gcf_membership(gcf_id, rank);
CREATE INDEX gcf_membership_gcf_val ON gcf_membership(gcf_id, membership_value);
CREATE INDEX gcf_membership_bgc_rank ON gcf_membership(bgc_id, rank);
CREATE INDEX gcf_membership_bgc_val ON gcf_membership(bgc_id, membership_value);