@@ -2633,22 +2633,11 @@ static inline int align_mem(kstring_t *s)
26332633 return e == 0 ? 0 : -1 ;
26342634}
26352635
2636- // p,q is the start and the end of the FORMAT field
26372636#define MAX_N_FMT 255 /* Limited by size of bcf1_t n_fmt field */
2638- static int vcf_parse_format (kstring_t * s , const bcf_hdr_t * h , bcf1_t * v , char * p , char * q )
2639- {
2640- if ( !bcf_hdr_nsamples (h ) ) return 0 ;
2641-
2642- static int extreme_val_warned = 0 ;
2643- char * r , * t ;
2644- int j , l , m , g , overflow = 0 ;
2645- khint_t k ;
2646- ks_tokaux_t aux1 ;
2647- vdict_t * d = (vdict_t * )h -> dict [BCF_DT_ID ];
2648- kstring_t * mem = (kstring_t * )& h -> mem ;
2649- fmt_aux_t fmt [MAX_N_FMT ];
2650- mem -> l = 0 ;
26512637
2638+ // detect FORMAT "."
2639+ static int vcf_parse_format_empty1 (kstring_t * s , const bcf_hdr_t * h , bcf1_t * v ,
2640+ char * p , char * q ) {
26522641 char * end = s -> s + s -> l ;
26532642 if ( q >=end )
26542643 {
@@ -2661,10 +2650,19 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
26612650 if ( p [0 ]== '.' && p [1 ]== 0 ) // FORMAT field is empty "."
26622651 {
26632652 v -> n_sample = bcf_hdr_nsamples (h );
2664- return 0 ;
2653+ return 1 ;
26652654 }
26662655
2667- // get format information from the dictionary
2656+ return 0 ;
2657+ }
2658+
2659+ // get format information from the dictionary
2660+ static int vcf_parse_format_dict2 (kstring_t * s , const bcf_hdr_t * h , bcf1_t * v ,
2661+ char * p , char * q , fmt_aux_t * fmt ) {
2662+ const vdict_t * d = (vdict_t * )h -> dict [BCF_DT_ID ];
2663+ char * t ;
2664+ int j ;
2665+ ks_tokaux_t aux1 ;
26682666 for (j = 0 , t = kstrtok (p , ":" , & aux1 ); t ; t = kstrtok (0 , 0 , & aux1 ), ++ j ) {
26692667 if (j >= MAX_N_FMT ) {
26702668 v -> errcode |= BCF_ERR_LIMITS ;
@@ -2674,7 +2672,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
26742672 }
26752673
26762674 * (char * )aux1 .p = 0 ;
2677- k = kh_get (vdict , d , t );
2675+ khint_t k = kh_get (vdict , d , t );
26782676 if (k == kh_end (d ) || kh_val (d , k ).info [BCF_HL_FMT ] == 15 ) {
26792677 if ( t [0 ]== '.' && t [1 ]== 0 )
26802678 {
@@ -2706,10 +2704,17 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
27062704 fmt [j ].y = h -> id [0 ][fmt [j ].key ].val -> info [BCF_HL_FMT ];
27072705 v -> n_fmt ++ ;
27082706 }
2709- // compute max
2707+ return 0 ;
2708+ }
2709+
2710+ // compute max
2711+ static int vcf_parse_format_max3 (kstring_t * s , const bcf_hdr_t * h , bcf1_t * v ,
2712+ char * p , char * q , fmt_aux_t * fmt ) {
27102713 int n_sample_ori = -1 ;
2711- r = q + 1 ; // r: position in the format string
2712- l = 0 , m = g = 1 , v -> n_sample = 0 ; // m: max vector size, l: max field len, g: max number of alleles
2714+ char * r = q + 1 ; // r: position in the format string
2715+ int l = 0 , m = 1 , g = 1 , j ;
2716+ v -> n_sample = 0 ; // m: max vector size, l: max field len, g: max number of alleles
2717+ char * end = s -> s + s -> l ;
27132718 while ( r < end )
27142719 {
27152720 // can we skip some samples?
@@ -2767,7 +2772,15 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
27672772 r ++ ;
27682773 }
27692774
2770- // allocate memory for arrays
2775+ return 0 ;
2776+ }
2777+
2778+ // allocate memory for arrays
2779+ static int vcf_parse_format_alloc4 (kstring_t * s , const bcf_hdr_t * h , bcf1_t * v ,
2780+ char * p , char * q , fmt_aux_t * fmt ) {
2781+ kstring_t * mem = (kstring_t * )& h -> mem ;
2782+
2783+ int j ;
27712784 for (j = 0 ; j < v -> n_fmt ; ++ j ) {
27722785 fmt_aux_t * f = & fmt [j ];
27732786 if ( !f -> max_m ) f -> max_m = 1 ; // omitted trailing format field
@@ -2804,11 +2817,25 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
28042817 }
28052818 mem -> l += v -> n_sample * f -> size ;
28062819 }
2807- for (j = 0 ; j < v -> n_fmt ; ++ j )
2808- fmt [j ].buf = (uint8_t * )mem -> s + fmt [j ].offset ;
2809- // fill the sample fields; at beginning of the loop, t points to the first char of a format
2810- n_sample_ori = -1 ;
2811- t = q + 1 ; m = 0 ; // m: sample id
2820+
2821+ {
2822+ int j ;
2823+ for (j = 0 ; j < v -> n_fmt ; ++ j )
2824+ fmt [j ].buf = (uint8_t * )mem -> s + fmt [j ].offset ;
2825+ }
2826+
2827+ return 0 ;
2828+ }
2829+
2830+ // fill the sample fields; at beginning of the loop
2831+ static int vcf_parse_format_fill5 (kstring_t * s , const bcf_hdr_t * h , bcf1_t * v ,
2832+ char * p , char * q , fmt_aux_t * fmt ) {
2833+ static int extreme_val_warned = 0 ;
2834+ int n_sample_ori = -1 ;
2835+ // t points to the first char of a format
2836+ char * t = q + 1 ;
2837+ int m = 0 ; // m: sample id
2838+ char * end = s -> s + s -> l ;
28122839 while ( t < end )
28132840 {
28142841 // can we skip some samples?
@@ -2824,7 +2851,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
28242851 }
28252852 if ( m == bcf_hdr_nsamples (h ) ) break ;
28262853
2827- j = 0 ; // j-th format field, m-th sample
2854+ int j = 0 ; // j-th format field, m-th sample
28282855 while ( t < end )
28292856 {
28302857 fmt_aux_t * z = & fmt [j ++ ];
@@ -2835,12 +2862,13 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
28352862 return -1 ;
28362863 }
28372864 if ((z -> y >>4 & 0xf ) == BCF_HT_STR ) {
2865+ int l ;
28382866 if (z -> is_gt ) { // genotypes
28392867 int32_t is_phased = 0 ;
28402868 uint32_t * x = (uint32_t * )(z -> buf + z -> size * (size_t )m );
28412869 uint32_t unreadable = 0 ;
28422870 uint32_t max = 0 ;
2843- overflow = 0 ;
2871+ int overflow = 0 ;
28442872 for (l = 0 ;; ++ t ) {
28452873 if (* t == '.' ) {
28462874 ++ t , x [l ++ ] = is_phased ;
@@ -2867,16 +2895,17 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
28672895 for (; l < z -> size >>2 ; ++ l ) x [l ] = bcf_int32_vector_end ;
28682896 } else {
28692897 char * x = (char * )z -> buf + z -> size * (size_t )m ;
2870- for (r = t , l = 0 ; * t != ':' && * t ; ++ t ) x [l ++ ] = * t ;
2898+ for (l = 0 ; * t != ':' && * t ; ++ t ) x [l ++ ] = * t ;
28712899 for (; l < z -> size ; ++ l ) x [l ] = 0 ;
28722900 }
28732901 } else if ((z -> y >>4 & 0xf ) == BCF_HT_INT ) {
28742902 int32_t * x = (int32_t * )(z -> buf + z -> size * (size_t )m );
2903+ int l ;
28752904 for (l = 0 ;; ++ t ) {
28762905 if (* t == '.' ) {
28772906 x [l ++ ] = bcf_int32_missing , ++ t ; // ++t to skip "."
28782907 } else {
2879- overflow = 0 ;
2908+ int overflow = 0 ;
28802909 char * te ;
28812910 long int tmp_val = hts_str2int (t , & te , sizeof (tmp_val )* CHAR_BIT , & overflow );
28822911 if ( te == t || overflow || tmp_val < BCF_MIN_BT_INT32 || tmp_val > BCF_MAX_BT_INT32 )
@@ -2897,11 +2926,12 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
28972926 for (; l < z -> size >>2 ; ++ l ) x [l ] = bcf_int32_vector_end ;
28982927 } else if ((z -> y >>4 & 0xf ) == BCF_HT_REAL ) {
28992928 float * x = (float * )(z -> buf + z -> size * (size_t )m );
2929+ int l ;
29002930 for (l = 0 ;; ++ t ) {
29012931 if (* t == '.' && !isdigit_c (t [1 ])) {
29022932 bcf_float_set_missing (x [l ++ ]), ++ t ; // ++t to skip "."
29032933 } else {
2904- overflow = 0 ;
2934+ int overflow = 0 ;
29052935 char * te ;
29062936 float tmp_val = hts_str2dbl (t , & te , & overflow );
29072937 if ( (te == t || overflow ) && !extreme_val_warned )
@@ -2940,6 +2970,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
29402970
29412971 for (; j < v -> n_fmt ; ++ j ) { // fill end-of-vector values
29422972 fmt_aux_t * z = & fmt [j ];
2973+ int l ;
29432974 if ((z -> y >>4 & 0xf ) == BCF_HT_STR ) {
29442975 if (z -> is_gt ) {
29452976 int32_t * x = (int32_t * )(z -> buf + z -> size * (size_t )m );
@@ -2964,7 +2995,12 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
29642995 m ++ ; t ++ ;
29652996 }
29662997
2967- // write individual genotype information
2998+ return 0 ;
2999+ }
3000+
3001+ // write individual genotype information
3002+ static int vcf_parse_format_gt6 (kstring_t * s , const bcf_hdr_t * h , bcf1_t * v ,
3003+ char * p , char * q , fmt_aux_t * fmt ) {
29683004 kstring_t * str = & v -> indiv ;
29693005 int i ;
29703006 if (v -> n_sample > 0 ) {
@@ -2988,6 +3024,11 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
29883024 }
29893025 }
29903026
3027+ return 0 ;
3028+ }
3029+
3030+ // validity checking
3031+ static int vcf_parse_format_check7 (const bcf_hdr_t * h , bcf1_t * v ) {
29913032 if ( v -> n_sample != bcf_hdr_nsamples (h ) )
29923033 {
29933034 hts_log_error ("Number of columns at %s:%" PRIhts_pos " does not match the number of samples (%d vs %d)" ,
@@ -3008,6 +3049,47 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
30083049 return 0 ;
30093050}
30103051
3052+ // p,q is the start and the end of the FORMAT field
3053+ static int vcf_parse_format (kstring_t * s , const bcf_hdr_t * h , bcf1_t * v , char * p , char * q )
3054+ {
3055+ if ( !bcf_hdr_nsamples (h ) ) return 0 ;
3056+ kstring_t * mem = (kstring_t * )& h -> mem ;
3057+ mem -> l = 0 ;
3058+
3059+ fmt_aux_t fmt [MAX_N_FMT ];
3060+
3061+ // detect FORMAT "."
3062+ int ret ; // +ve = ok, -ve = err
3063+ if ((ret = vcf_parse_format_empty1 (s , h , v , p , q )))
3064+ return ret ? 0 : -1 ;
3065+
3066+ // get format information from the dictionary
3067+ if (vcf_parse_format_dict2 (s , h , v , p , q , fmt ) < 0 )
3068+ return -1 ;
3069+
3070+ // compute max
3071+ if (vcf_parse_format_max3 (s , h , v , p , q , fmt ) < 0 )
3072+ return -1 ;
3073+
3074+ // allocate memory for arrays
3075+ if (vcf_parse_format_alloc4 (s , h , v , p , q , fmt ) < 0 )
3076+ return -1 ;
3077+
3078+ // fill the sample fields; at beginning of the loop
3079+ if (vcf_parse_format_fill5 (s , h , v , p , q , fmt ) < 0 )
3080+ return -1 ;
3081+
3082+ // write individual genotype information
3083+ if (vcf_parse_format_gt6 (s , h , v , p , q , fmt ) < 0 )
3084+ return -1 ;
3085+
3086+ // validity checking
3087+ if (vcf_parse_format_check7 (h , v ) < 0 )
3088+ return -1 ;
3089+
3090+ return 0 ;
3091+ }
3092+
30113093static khint_t fix_chromosome (const bcf_hdr_t * h , vdict_t * d , const char * p ) {
30123094 // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
30133095 // been already printed, but will enable tools like vcfcheck to proceed.
0 commit comments