-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathclean_biobank1_data.py
More file actions
executable file
·51 lines (34 loc) · 1.71 KB
/
clean_biobank1_data.py
File metadata and controls
executable file
·51 lines (34 loc) · 1.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python3
"""Clean UK Biobank scanner 1 (Cheadle) data.
Subjects from the Cheadle Assessment Centre (code 11025) are predominantly white.
In addition, some ages have a very low number of subjects (<100). Ethnic minorities
and ages with low subject counts are removed from further analysis, as are subjects
with any mental or brain disorder.
"""
from pathlib import Path
from utils import load_demographic_data
# Taken from the current working directory, so the relative 'data' and 'outputs'
# paths used in main() resolve against wherever the script is launched from.
PROJECT_ROOT = Path.cwd()
def main() -> None:
    """Clean UK Biobank scanner 1 data.

    Loads the demographic table through ``load_demographic_data``, keeps only
    the rows that pass every inclusion filter below, and writes the remaining
    ``Image_ID`` values to ``outputs/cleaned_ids.csv`` under ``PROJECT_ROOT``.

    Raises:
        ValueError: If the filtered ``Image_ID`` column contains duplicates.
    """
    # ----------------------------------------------------------------------------------------
    participants_path = PROJECT_ROOT / 'data' / 'BIOBANK' / 'participants.tsv'
    ids_path = PROJECT_ROOT / 'data' / 'BIOBANK' / 'freesurferData.csv'
    output_ids_filename = 'cleaned_ids.csv'
    # ----------------------------------------------------------------------------------------
    # Create experiment's output directory
    outputs_dir = PROJECT_ROOT / 'outputs'
    outputs_dir.mkdir(exist_ok=True)

    dataset = load_demographic_data(participants_path, ids_path)

    # Exclude subjects outside [47, 73] interval (ages with <100 participants).
    dataset = dataset.loc[(dataset['Age'] >= 47) & (dataset['Age'] <= 73)]

    # Exclude non-white ethnicities due to small subgroups
    dataset = dataset.loc[dataset['Ethnicity'] == 'White']

    # Exclude scanner02
    dataset = dataset.loc[dataset['Dataset'] == 'BIOBANK-SCANNER01']

    # Exclude subjects with previous hospitalization.
    # NOTE(review): presumably Diagn == 1 encodes healthy controls -- confirm
    # against load_demographic_data, which is not visible here.
    dataset = dataset.loc[dataset['Diagn'] == 1]

    output_ids_df = dataset[['Image_ID']]

    # Explicit check instead of `assert`: asserts are removed under `python -O`,
    # which would let duplicated IDs be written out silently.
    duplicated_mask = output_ids_df.duplicated()
    if duplicated_mask.any():
        raise ValueError(
            f'Found {int(duplicated_mask.sum())} duplicated Image_ID entries '
            'after cleaning; refusing to write the output file.'
        )

    output_ids_df.to_csv(outputs_dir / output_ids_filename, index=False)
# Run the cleaning pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()