Skip to content

Commit 7c1d3cc

Browse files
committed
The first stage of vcf_parse_format speed improvements.
This simply turns the monolithic vcf_parse_format functions into a series of numbers sub-functions whose primary is to localise the variables to that code block and to make it easier to see the structure of the tasks being performed. There is no code optimisation here and the main algorithm is unchanged, so this is just moving of code from 1 function to multiple functions. However it makes the next commit easier to understand as we're not trying to see a delta mixed in with restructuring. An unexpected consequence however of making the variables local to their blocks is that it also speeds up the code. The first step in separating this code into functions was simply adding curly braces around each code segment and moving the function-global variables into their respective blocks. The before/after benchmarkjs on 100,000 lines of a multi-sample G1K VCF are ("perf stat" cycle counts): ORIG LOCALISED gcc7 29335942870 27353443704 gcc13 31974757094 31452452908 clang13 31290989382 29508665020 Benchmarked again after moving to actual functions, but the difference was tiny in comparison.)
1 parent 5098983 commit 7c1d3cc

File tree

1 file changed

+114
-32
lines changed

1 file changed

+114
-32
lines changed

vcf.c

Lines changed: 114 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2633,22 +2633,11 @@ static inline int align_mem(kstring_t *s)
26332633
return e == 0 ? 0 : -1;
26342634
}
26352635

2636-
// p,q is the start and the end of the FORMAT field
26372636
#define MAX_N_FMT 255 /* Limited by size of bcf1_t n_fmt field */
2638-
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
2639-
{
2640-
if ( !bcf_hdr_nsamples(h) ) return 0;
2641-
2642-
static int extreme_val_warned = 0;
2643-
char *r, *t;
2644-
int j, l, m, g, overflow = 0;
2645-
khint_t k;
2646-
ks_tokaux_t aux1;
2647-
vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2648-
kstring_t *mem = (kstring_t*)&h->mem;
2649-
fmt_aux_t fmt[MAX_N_FMT];
2650-
mem->l = 0;
26512637

2638+
// detect FORMAT "."
2639+
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2640+
char *p, char *q) {
26522641
char *end = s->s + s->l;
26532642
if ( q>=end )
26542643
{
@@ -2661,10 +2650,19 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
26612650
if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
26622651
{
26632652
v->n_sample = bcf_hdr_nsamples(h);
2664-
return 0;
2653+
return 1;
26652654
}
26662655

2667-
// get format information from the dictionary
2656+
return 0;
2657+
}
2658+
2659+
// get format information from the dictionary
2660+
static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2661+
char *p, char *q, fmt_aux_t *fmt) {
2662+
const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2663+
char *t;
2664+
int j;
2665+
ks_tokaux_t aux1;
26682666
for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
26692667
if (j >= MAX_N_FMT) {
26702668
v->errcode |= BCF_ERR_LIMITS;
@@ -2674,7 +2672,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
26742672
}
26752673

26762674
*(char*)aux1.p = 0;
2677-
k = kh_get(vdict, d, t);
2675+
khint_t k = kh_get(vdict, d, t);
26782676
if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
26792677
if ( t[0]=='.' && t[1]==0 )
26802678
{
@@ -2706,10 +2704,17 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
27062704
fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
27072705
v->n_fmt++;
27082706
}
2709-
// compute max
2707+
return 0;
2708+
}
2709+
2710+
// compute max
2711+
static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2712+
char *p, char *q, fmt_aux_t *fmt) {
27102713
int n_sample_ori = -1;
2711-
r = q + 1; // r: position in the format string
2712-
l = 0, m = g = 1, v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles
2714+
char *r = q + 1; // r: position in the format string
2715+
int l = 0, m = 1, g = 1, j;
2716+
v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles
2717+
char *end = s->s + s->l;
27132718
while ( r<end )
27142719
{
27152720
// can we skip some samples?
@@ -2767,7 +2772,15 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
27672772
r++;
27682773
}
27692774

2770-
// allocate memory for arrays
2775+
return 0;
2776+
}
2777+
2778+
// allocate memory for arrays
2779+
static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2780+
char *p, char *q, fmt_aux_t *fmt) {
2781+
kstring_t *mem = (kstring_t*)&h->mem;
2782+
2783+
int j;
27712784
for (j = 0; j < v->n_fmt; ++j) {
27722785
fmt_aux_t *f = &fmt[j];
27732786
if ( !f->max_m ) f->max_m = 1; // omitted trailing format field
@@ -2804,11 +2817,25 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
28042817
}
28052818
mem->l += v->n_sample * f->size;
28062819
}
2807-
for (j = 0; j < v->n_fmt; ++j)
2808-
fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
2809-
// fill the sample fields; at beginning of the loop, t points to the first char of a format
2810-
n_sample_ori = -1;
2811-
t = q + 1; m = 0; // m: sample id
2820+
2821+
{
2822+
int j;
2823+
for (j = 0; j < v->n_fmt; ++j)
2824+
fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
2825+
}
2826+
2827+
return 0;
2828+
}
2829+
2830+
// fill the sample fields; at beginning of the loop
2831+
static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2832+
char *p, char *q, fmt_aux_t *fmt) {
2833+
static int extreme_val_warned = 0;
2834+
int n_sample_ori = -1;
2835+
// t points to the first char of a format
2836+
char *t = q + 1;
2837+
int m = 0; // m: sample id
2838+
char *end = s->s + s->l;
28122839
while ( t<end )
28132840
{
28142841
// can we skip some samples?
@@ -2824,7 +2851,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
28242851
}
28252852
if ( m == bcf_hdr_nsamples(h) ) break;
28262853

2827-
j = 0; // j-th format field, m-th sample
2854+
int j = 0; // j-th format field, m-th sample
28282855
while ( t < end )
28292856
{
28302857
fmt_aux_t *z = &fmt[j++];
@@ -2835,12 +2862,13 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
28352862
return -1;
28362863
}
28372864
if ((z->y>>4&0xf) == BCF_HT_STR) {
2865+
int l;
28382866
if (z->is_gt) { // genotypes
28392867
int32_t is_phased = 0;
28402868
uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
28412869
uint32_t unreadable = 0;
28422870
uint32_t max = 0;
2843-
overflow = 0;
2871+
int overflow = 0;
28442872
for (l = 0;; ++t) {
28452873
if (*t == '.') {
28462874
++t, x[l++] = is_phased;
@@ -2867,16 +2895,17 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
28672895
for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
28682896
} else {
28692897
char *x = (char*)z->buf + z->size * (size_t)m;
2870-
for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t;
2898+
for (l = 0; *t != ':' && *t; ++t) x[l++] = *t;
28712899
for (; l < z->size; ++l) x[l] = 0;
28722900
}
28732901
} else if ((z->y>>4&0xf) == BCF_HT_INT) {
28742902
int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
2903+
int l;
28752904
for (l = 0;; ++t) {
28762905
if (*t == '.') {
28772906
x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
28782907
} else {
2879-
overflow = 0;
2908+
int overflow = 0;
28802909
char *te;
28812910
long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
28822911
if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
@@ -2897,11 +2926,12 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
28972926
for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
28982927
} else if ((z->y>>4&0xf) == BCF_HT_REAL) {
28992928
float *x = (float*)(z->buf + z->size * (size_t)m);
2929+
int l;
29002930
for (l = 0;; ++t) {
29012931
if (*t == '.' && !isdigit_c(t[1])) {
29022932
bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
29032933
} else {
2904-
overflow = 0;
2934+
int overflow = 0;
29052935
char *te;
29062936
float tmp_val = hts_str2dbl(t, &te, &overflow);
29072937
if ( (te==t || overflow) && !extreme_val_warned )
@@ -2940,6 +2970,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
29402970

29412971
for (; j < v->n_fmt; ++j) { // fill end-of-vector values
29422972
fmt_aux_t *z = &fmt[j];
2973+
int l;
29432974
if ((z->y>>4&0xf) == BCF_HT_STR) {
29442975
if (z->is_gt) {
29452976
int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
@@ -2964,7 +2995,12 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
29642995
m++; t++;
29652996
}
29662997

2967-
// write individual genotype information
2998+
return 0;
2999+
}
3000+
3001+
// write individual genotype information
3002+
static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3003+
char *p, char *q, fmt_aux_t *fmt) {
29683004
kstring_t *str = &v->indiv;
29693005
int i;
29703006
if (v->n_sample > 0) {
@@ -2988,6 +3024,11 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
29883024
}
29893025
}
29903026

3027+
return 0;
3028+
}
3029+
3030+
// validity checking
3031+
static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
29913032
if ( v->n_sample!=bcf_hdr_nsamples(h) )
29923033
{
29933034
hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
@@ -3008,6 +3049,47 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
30083049
return 0;
30093050
}
30103051

3052+
// p,q is the start and the end of the FORMAT field
3053+
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
3054+
{
3055+
if ( !bcf_hdr_nsamples(h) ) return 0;
3056+
kstring_t *mem = (kstring_t*)&h->mem;
3057+
mem->l = 0;
3058+
3059+
fmt_aux_t fmt[MAX_N_FMT];
3060+
3061+
// detect FORMAT "."
3062+
int ret; // +ve = ok, -ve = err
3063+
if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
3064+
return ret ? 0 : -1;
3065+
3066+
// get format information from the dictionary
3067+
if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
3068+
return -1;
3069+
3070+
// compute max
3071+
if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
3072+
return -1;
3073+
3074+
// allocate memory for arrays
3075+
if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
3076+
return -1;
3077+
3078+
// fill the sample fields; at beginning of the loop
3079+
if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
3080+
return -1;
3081+
3082+
// write individual genotype information
3083+
if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
3084+
return -1;
3085+
3086+
// validity checking
3087+
if (vcf_parse_format_check7(h, v) < 0)
3088+
return -1;
3089+
3090+
return 0;
3091+
}
3092+
30113093
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
30123094
// Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
30133095
// been already printed, but will enable tools like vcfcheck to proceed.

0 commit comments

Comments
 (0)