Skip to content

Commit 01c6811

Browse files
authored
Merge pull request #5 from Xinglab/dev
Dev to v1.2
2 parents b09c5b2 + bdb2321 commit 01c6811

File tree

12 files changed

+529
-94
lines changed

12 files changed

+529
-94
lines changed

CLAM/config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# -*- coding: UTF-8 -*-
2+
3+
""" General Version and other info
4+
"""
5+
6+
__version__ = '1.2.0'
7+
__author__ = 'Zijun Zhang'
8+
__email__ = 'zj.z@ucla.edu'

CLAM/download_data.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import os
2+
import sys
3+
import subprocess
4+
5+
6+
def parser(args):
7+
"""DOCSTRING
8+
Args
9+
Returns
10+
"""
11+
try:
12+
genome = args.genome
13+
download_genome(genome)
14+
except KeyboardInterrupt():
15+
sys.exit(0)
16+
17+
18+
def download_genome(genome):
19+
curr_dir = os.path.abspath('.')
20+
21+
admin = (os.getuid() == 0)
22+
cmd = []
23+
home = os.environ['HOME']
24+
if admin:
25+
profile = '/etc/profile'
26+
else:
27+
profile = '{home}/.bashrc'.format(home=home)
28+
29+
if not os.path.isdir('{home}/.clam_data'.format(home=home)):
30+
os.mkdir('{home}/.clam_data'.format(home=home))
31+
os.chdir('{home}/.clam_data'.format(home=home))
32+
33+
if 'CLAM_DAT' not in os.environ or not os.environ['CLAM_DAT'] == '{home}/.clam_data'.format(home=home):
34+
cmd.append('echo "export CLAM_DAT=\'{clam_data}\'" >> {profile}'.format(
35+
clam_data=os.path.abspath('.'), profile=profile))
36+
cmd.append('source {profile}'.format(profile=profile))
37+
os.environ['CLAM_DAT'] = os.path.abspath('.')
38+
39+
if not check_genome_data(genome):
40+
cmd.append('chmod -R 755 {home}/.clam_data'.format(home=home))
41+
cmd.append(
42+
'wget https://raw.githubusercontent.com/wkdeng/clam_data/master/{genome}.zip'.format(genome=genome))
43+
cmd.append('unzip -o {genome}.zip'.format(genome=genome))
44+
cmd.append('rm {genome}.zip'.format(genome=genome))
45+
for item in cmd:
46+
subprocess.call(item, shell=True, executable='/bin/bash')
47+
print('Download finished')
48+
os.chdir(curr_dir)
49+
50+
def check_genome_data(genome):
51+
if not os.path.isdir(os.environ['CLAM_DAT'] + '/' + genome):
52+
return False
53+
if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/3UTRs.bed'):
54+
return False
55+
if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/5UTRs.bed'):
56+
return False
57+
if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/cds.bed'):
58+
return False
59+
if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/exons.bed'):
60+
return False
61+
if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/introns.bed'):
62+
return False
63+
if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/proximal200_intron.bed'):
64+
return False
65+
if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/proximal500_intron.bed'):
66+
return False
67+
return True
68+
69+
if __name__ == '__main__':
70+
download_genome('hg38')

CLAM/peak_annotator.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import sys
2+
import os
3+
import pybedtools
4+
import argparse as ap
5+
import logging
6+
from . import download_data, config
7+
8+
'''
9+
Assign peaks to genomic regions
10+
Zijun Zhang
11+
8.1.2018
12+
10.25.2018: wrapped to a function with document
13+
14+
DWK
15+
modified to output annotation file
16+
6.12.2019
17+
'''
18+
19+
# pylint: disable-msg=too-many-function-args
20+
# pylint: disable-msg=unexpected-keyword-arg
21+
22+
23+
def parser(args):
24+
"""DOCSTRING
25+
Args
26+
Returns
27+
"""
28+
try:
29+
peak_in = args.peak_in
30+
genome = args.genome
31+
out_file = args.out_file
32+
if 'CLAM_DAT' not in os.environ or not download_data.check_genome_data(genome):
33+
print("Unable to locate CLAM data folder for genomic regions, will try to download.")
34+
print("Downloading...")
35+
download_data.download_genome(genome)
36+
genome_data = os.environ['CLAM_DAT']
37+
intersect_gtf_regions(
38+
peak_in, out_file, os.path.join(genome_data, genome))
39+
except KeyboardInterrupt():
40+
sys.exit(0)
41+
42+
43+
def intersect_gtf_regions(peak_fp, outfn, gtf_dir):
44+
'''function: intersect_gtf_regions(peak_fp, outfn, gtf_dir)
45+
Intersect a peak BED file with a list of genomic region annotations (e.g. start/stop codon, UTR, intron),
46+
output the peak-region annotations.
47+
:param peak_fp: filepath to a BED-format peakquit
48+
:param outfn: filepath to output count file, has to end with ".txt"; annotation will be "NNN.annot.txt"
49+
50+
'''
51+
# input arguments
52+
53+
# make pybedtools objects
54+
print("Loading peaks...")
55+
peaks = pybedtools.BedTool(peak_fp)
56+
print("Peak file loaded.")
57+
print("Loading genome annotation...")
58+
ref_dict = {
59+
'exon': pybedtools.BedTool(os.path.join(gtf_dir, 'exons.bed')),
60+
'3UTR': pybedtools.BedTool(os.path.join(gtf_dir, '3UTRs.bed')),
61+
'5UTR': pybedtools.BedTool(os.path.join(gtf_dir, '5UTRs.bed')),
62+
'cds': pybedtools.BedTool(os.path.join(gtf_dir, 'cds.bed')),
63+
'intron': pybedtools.BedTool(os.path.join(gtf_dir, 'introns.bed')),
64+
'proximal200': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal200_intron.bed')),
65+
'proximal500': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal500_intron.bed'))
66+
}
67+
print("Genome annotation loaded.")
68+
69+
# # process reference for use
70+
target = {
71+
"3UTR": ref_dict['3UTR'],
72+
"5UTR": ref_dict['5UTR'],
73+
"CDS": ref_dict['cds'],
74+
"other_exon": ref_dict['exon']-ref_dict['3UTR']-ref_dict['5UTR']-ref_dict['cds'],
75+
"px200_intron": ref_dict['proximal200'],
76+
"px500_intron": ref_dict['proximal500'].subtract(ref_dict['proximal200']),
77+
"distal_intron": ref_dict['intron'].subtract(ref_dict['exon']).subtract(ref_dict['proximal500'])
78+
}
79+
category_list = ['3UTR', '5UTR', 'CDS',
80+
'other_exon', "px200_intron", "px500_intron", "distal_intron"]
81+
init = True
82+
83+
print("Intersecting peaks with genome annotation...")
84+
for cat in category_list:
85+
bed_arr = []
86+
for interval in target[cat]:
87+
bed_arr.append('\t'.join([str(x) for x in interval.fields]))
88+
bed_arr[-1] = bed_arr[-1] + '\t' + cat
89+
bed_arr = list(dict.fromkeys(bed_arr))
90+
for i in range(len(bed_arr)):
91+
bed_arr[i] = bed_arr[i].split('\t')
92+
target[cat] = pybedtools.BedTool(bed_arr)
93+
94+
if init:
95+
init = False
96+
result_bed = peaks.intersect(target[cat], wa=True, wb=True)
97+
else:
98+
result_bed = result_bed.cat(peaks.intersect(
99+
target[cat], wa=True, wb=True), postmerge=False)
100+
result_bed = result_bed.sort()
101+
102+
print("Preparing output...")
103+
result_bed.saveas(outfn + '_')
104+
prepend = ['## Annotation peaks to genomic regions, all intersected genomic regions are presented.',
105+
'## CLAM version: %s'%config.__version__,
106+
'## Column 1: Peak chromosome',
107+
'## Column 2: Peak start',
108+
'## Column 3: Peak end',
109+
'## Column 4: Peak name',
110+
'## Column 5: Peak score',
111+
'## Column 6: Peak strand',
112+
'## Column 7: Peak signal value',
113+
'## Column 8: Peak pValue',
114+
'## Column 9: Peak qValue',
115+
'## Column 10: Point-source called for this peak',
116+
'## Column 11: Genomic region chromosome',
117+
'## Column 12: Genomic region start',
118+
'## Column 13: Genomic region end',
119+
'## Column 14: Gene ID',
120+
'## Column 15: Quality score',
121+
'## Column 16: Genomic region strand',
122+
'## Column 17: Genomic region type']
123+
if os.path.exists(outfn):
124+
os.remove(outfn)
125+
for line in prepend:
126+
cmd = 'echo "{prepend}" >> {outfn}'.format(
127+
prepend=line, outfn=outfn)
128+
os.system(cmd)
129+
os.system('cat {outtmp} >> {outfn}'.format(
130+
outtmp=outfn + '_', outfn=outfn))
131+
os.remove(outfn+'_')
132+
print("DONE")
133+
134+
135+
if __name__ == '__main__':
136+
# peak_fp, genome, outfn = sys.argv[1], sys.argv[2], sys.argv[3]
137+
os.chdir('/mnt/h/yi_lab/m6a/src/scripts/peakComposition')
138+
peak_in, genome, out_file = 'narrow_peak.unique.bed', 'mm10', 'annotate_peak.bed'
139+
if 'CLAM_DAT' not in os.environ or not download_data.check_genome_data(genome):
140+
print("Unable to find CLAM data folder for genomic regions, please try to download it using download_genome command.")
141+
print("Downloading...")
142+
download_data.download_genome(genome)
143+
genome_data = os.environ['CLAM_DAT']
144+
intersect_gtf_regions(
145+
peak_in, out_file, os.path.join(genome_data, genome))

0 commit comments

Comments
 (0)