@@ -78,6 +78,7 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N
7878
7979#define BCF_IS_64BIT (1<<30)
8080
81+ static int vcf_parse_format_partial (bcf1_t * v );
8182
8283static char * find_chrom_header_line (char * s )
8384{
@@ -1439,6 +1440,11 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
14391440static inline uint8_t * bcf_unpack_fmt_core1 (uint8_t * ptr , int n_sample , bcf_fmt_t * fmt );
14401441int bcf_subset_format (const bcf_hdr_t * hdr , bcf1_t * rec )
14411442{
1443+ if (rec -> unpacked & VCF_UN_FMT ) {
1444+ if (vcf_parse_format_partial (rec ) < 0 )
1445+ return -1 ;
1446+ }
1447+
14421448 if ( !hdr -> keep_samples ) return 0 ;
14431449 if ( !bcf_hdr_nsamples (hdr ) )
14441450 {
@@ -1734,16 +1740,23 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
17341740 if ( h -> dirty ) {
17351741 if (bcf_hdr_sync (h ) < 0 ) return -1 ;
17361742 }
1743+
1744+ if ( hfp -> format .format == vcf || hfp -> format .format == text_format )
1745+ return vcf_write (hfp ,h ,v );
1746+
1747+ if (v -> unpacked & VCF_UN_FMT ) {
1748+ // slow, but round trip test
1749+ if (vcf_parse_format_partial (v ) < 0 )
1750+ return -1 ;
1751+ }
1752+
17371753 if ( bcf_hdr_nsamples (h )!= v -> n_sample )
17381754 {
17391755 hts_log_error ("Broken VCF record, the number of columns at %s:%" PRIhts_pos " does not match the number of samples (%d vs %d)" ,
17401756 bcf_seqname_safe (h ,v ), v -> pos + 1 , v -> n_sample , bcf_hdr_nsamples (h ));
17411757 return -1 ;
17421758 }
17431759
1744- if ( hfp -> format .format == vcf || hfp -> format .format == text_format )
1745- return vcf_write (hfp ,h ,v );
1746-
17471760 if ( v -> errcode )
17481761 {
17491762 // vcf_parse1() encountered a new contig or tag, undeclared in the
@@ -2569,6 +2582,45 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
25692582 return 0 ;
25702583}
25712584
2585+ // The indiv.s kstring is the textual VCF representation for FORMAT
2586+ // column and all the subsequent SAMPLE columns.
2587+ //
2588+ // We also need the header, and this cannot be passed in as bcf_unpack calls
2589+ // this and it doesn't have the header available.
2590+ // So we also cache a pointer to the header in the first bytes of the
2591+ // kstring too.
2592+ //
2593+ // This packing ensures the data is still in kstring format and amenable
2594+ // to freeing / reuse. I.e.:
2595+ //
2596+ // s.s p q s.l s.m
2597+ // | | | | |
2598+ // <HDR_PTR><FORMAT><SAMPLE\tSAMPLE>.........
2599+ //
2600+ // Returns 0 on success,
2601+ // <0 on failure.
2602+ static int vcf_parse_format_partial (bcf1_t * v ) {
2603+ if (!(v -> unpacked & VCF_UN_FMT ))
2604+ return 0 ;
2605+ kstring_t s = v -> indiv ;
2606+ const bcf_hdr_t * h = * (const bcf_hdr_t * * )s .s ;
2607+ char * p = s .s + sizeof (const bcf_hdr_t * );
2608+ char * q = p + strlen (p );
2609+
2610+ v -> indiv .s = NULL ;
2611+ v -> indiv .l = v -> indiv .m = 0 ;
2612+
2613+ int ret ;
2614+ if ((ret = vcf_parse_format (& s , h , v , p , q ) == 0 )) {
2615+ v -> unpacked &= ~VCF_UN_FMT ;
2616+ free (s .s );
2617+ return ret ;
2618+ } else {
2619+ v -> indiv = s ;
2620+ return ret ;
2621+ }
2622+ }
2623+
25722624static khint_t fix_chromosome (const bcf_hdr_t * h , vdict_t * d , const char * p ) {
25732625 // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
25742626 // been already printed, but will enable tools like vcfcheck to proceed.
@@ -2890,7 +2942,29 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
28902942 }
28912943 if ( v -> max_unpack && !(v -> max_unpack >>3 ) ) goto end ;
28922944 } else if (i == 8 ) {// FORMAT
2893- return vcf_parse_format (s , h , v , p , q ) == 0 ? 0 : -2 ;
2945+ // Consider complete copy of s, obtained via ks_release,
2946+ // and cache of p/q pointers. This is then a generalised
2947+ // parse delay that works for any max_unpack value.
2948+
2949+ kstring_t * iv = & v -> indiv ;
2950+ ks_clear (iv );
2951+ if (kputsn ((char * )& h , sizeof (& h ), iv ) < 0 ||
2952+ kputsn (p , s -> s + s -> l - p , iv ) < 0 )
2953+ goto err ;
2954+
2955+ v -> unpacked |= VCF_UN_FMT ;
2956+
2957+ // We can't accurately judge n_sample and some things may check
2958+ // this without doing an explicit bcf_unpack call, but we
2959+ // set it to bcf_hdr_nsamples(h) as a starting point.
2960+ // (vcf_parse_format validates this and it's an error if it
2961+ // mismatches, so this is initial value is incorrect if the
2962+ // data itself is incorrect, and we'll spot that on an explict
2963+ // bcf_unpack call.)
2964+ v -> n_sample = bcf_hdr_nsamples (h );
2965+
2966+ return 0 ;
2967+ //return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
28942968 }
28952969 }
28962970
@@ -3002,6 +3076,10 @@ int bcf_unpack(bcf1_t *b, int which)
30023076 ptr = bcf_unpack_info_core1 (ptr , & d -> info [i ]);
30033077 b -> unpacked |= BCF_UN_INFO ;
30043078 }
3079+ if ((which & BCF_UN_FMT ) && (b -> unpacked & VCF_UN_FMT )) {
3080+ if (vcf_parse_format_partial (b ) < 0 )
3081+ return -1 ;
3082+ }
30053083 if ((which & BCF_UN_FMT ) && b -> n_sample && !(b -> unpacked & BCF_UN_FMT )) { // FORMAT
30063084 ptr = (uint8_t * )b -> indiv .s ;
30073085 hts_expand (bcf_fmt_t , b -> n_fmt , d -> m_fmt , d -> fmt );
@@ -3016,7 +3094,8 @@ int bcf_unpack(bcf1_t *b, int which)
30163094int vcf_format (const bcf_hdr_t * h , const bcf1_t * v , kstring_t * s )
30173095{
30183096 int i ;
3019- bcf_unpack ((bcf1_t * )v , BCF_UN_ALL );
3097+ bcf_unpack ((bcf1_t * )v , (v -> unpacked & VCF_UN_FMT )
3098+ ? BCF_UN_SHR : BCF_UN_ALL );
30203099 kputs (h -> id [BCF_DT_CTG ][v -> rid ].key , s ); // CHROM
30213100 kputc ('\t' , s ); kputll (v -> pos + 1 , s ); // POS
30223101 kputc ('\t' , s ); kputs (v -> d .id ? v -> d .id : "." , s ); // ID
@@ -3075,45 +3154,54 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
30753154 if ( first ) kputc ('.' , s );
30763155 } else kputc ('.' , s );
30773156 // FORMAT and individual information
3078- if (v -> n_sample )
3079- {
3080- int i ,j ;
3081- if ( v -> n_fmt )
3157+ if (v -> unpacked & VCF_UN_FMT ) {
3158+ size_t l = strlen (v -> indiv .s + sizeof (bcf_hdr_t * ));
3159+ kputc ('\t' , s );
3160+ kputsn (v -> indiv .s + sizeof (bcf_hdr_t * ), l , s );
3161+ kputc ('\t' , s );
3162+ kputsn (v -> indiv .s + sizeof (bcf_hdr_t * ) + l + 1 ,
3163+ v -> indiv .l - (sizeof (bcf_hdr_t * ) + l + 1 ), s );
3164+ } else {
3165+ if (v -> n_sample )
30823166 {
3083- int gt_i = -1 ;
3084- bcf_fmt_t * fmt = v -> d .fmt ;
3085- int first = 1 ;
3086- for (i = 0 ; i < (int )v -> n_fmt ; ++ i ) {
3087- if ( !fmt [i ].p ) continue ;
3088- kputc (!first ? ':' : '\t' , s ); first = 0 ;
3089- if ( fmt [i ].id < 0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
3090- {
3091- hts_log_error ("Invalid BCF, the FORMAT tag id=%d at %s:%" PRIhts_pos " not present in the header" , fmt [i ].id , bcf_seqname_safe (h ,(bcf1_t * )v ), v -> pos + 1 );
3092- abort ();
3093- }
3094- kputs (h -> id [BCF_DT_ID ][fmt [i ].id ].key , s );
3095- if (strcmp (h -> id [BCF_DT_ID ][fmt [i ].id ].key , "GT" ) == 0 ) gt_i = i ;
3096- }
3097- if ( first ) kputs ("\t." , s );
3098- for (j = 0 ; j < v -> n_sample ; ++ j ) {
3099- kputc ('\t' , s );
3100- first = 1 ;
3167+ int i ,j ;
3168+ if ( v -> n_fmt )
3169+ {
3170+ int gt_i = -1 ;
3171+ bcf_fmt_t * fmt = v -> d .fmt ;
3172+ int first = 1 ;
31013173 for (i = 0 ; i < (int )v -> n_fmt ; ++ i ) {
3102- bcf_fmt_t * f = & fmt [i ];
3103- if ( !f -> p ) continue ;
3104- if (!first ) kputc (':' , s );
3105- first = 0 ;
3106- if (gt_i == i )
3107- bcf_format_gt (f ,j ,s );
3108- else
3109- bcf_fmt_array (s , f -> n , f -> type , f -> p + j * (size_t )f -> size );
3174+ if ( !fmt [i ].p ) continue ;
3175+ kputc (!first ? ':' : '\t' , s ); first = 0 ;
3176+ if ( fmt [i ].id < 0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
3177+ {
3178+ hts_log_error ("Invalid BCF, the FORMAT tag id=%d at %s:%" PRIhts_pos " not present in the header" , fmt [i ].id , bcf_seqname_safe (h ,(bcf1_t * )v ), v -> pos + 1 );
3179+ abort ();
3180+ }
3181+ kputs (h -> id [BCF_DT_ID ][fmt [i ].id ].key , s );
3182+ if (strcmp (h -> id [BCF_DT_ID ][fmt [i ].id ].key , "GT" ) == 0 ) gt_i = i ;
3183+ }
3184+ if ( first ) kputs ("\t." , s );
3185+ for (j = 0 ; j < v -> n_sample ; ++ j ) {
3186+ kputc ('\t' , s );
3187+ first = 1 ;
3188+ for (i = 0 ; i < (int )v -> n_fmt ; ++ i ) {
3189+ bcf_fmt_t * f = & fmt [i ];
3190+ if ( !f -> p ) continue ;
3191+ if (!first ) kputc (':' , s );
3192+ first = 0 ;
3193+ if (gt_i == i )
3194+ bcf_format_gt (f ,j ,s );
3195+ else
3196+ bcf_fmt_array (s , f -> n , f -> type , f -> p + j * (size_t )f -> size );
3197+ }
3198+ if ( first ) kputc ('.' , s );
31103199 }
3111- if ( first ) kputc ('.' , s );
31123200 }
3201+ else
3202+ for (j = 0 ; j <=v -> n_sample ; j ++ )
3203+ kputs ("\t." , s );
31133204 }
3114- else
3115- for (j = 0 ; j <=v -> n_sample ; j ++ )
3116- kputs ("\t." , s );
31173205 }
31183206 kputc ('\n' , s );
31193207 return 0 ;
@@ -3824,6 +3912,11 @@ int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
38243912
38253913int bcf_subset (const bcf_hdr_t * h , bcf1_t * v , int n , int * imap )
38263914{
3915+ if (v -> unpacked & VCF_UN_FMT ) {
3916+ if (vcf_parse_format_partial (v ) < 0 )
3917+ return -1 ;
3918+ }
3919+
38273920 kstring_t ind ;
38283921 ind .s = 0 ; ind .l = ind .m = 0 ;
38293922 if (n ) {
0 commit comments