Skip to content

Commit b1acab6

Browse files
valeriuojkbonfielddaviesrob
committed
Implement and use the hts_str2dbl method (by @jkbonfield)
For faster conversion of strings to floating point values. It speeds up the conversion of values which match the regexp [+-]?[0-9]*[.]?[0-9]* and have at least one and no more than 15 digits in total. Values that don't match (for example they include an exponent) are handled by passing to strtod(). Co-Authored-By: James Bonfield <[email protected]> Co-Authored-By: Rob Davies <[email protected]>
1 parent 6d7da63 commit b1acab6

File tree

2 files changed

+115
-12
lines changed

2 files changed

+115
-12
lines changed

textutils_internal.h

Lines changed: 91 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* textutils_internal.h -- non-bioinformatics utility routines for text etc.
22
3-
Copyright (C) 2016,2018,2019 Genome Research Ltd.
3+
Copyright (C) 2016,2018-2020 Genome Research Ltd.
44
55
Author: John Marshall <[email protected]>
66
@@ -304,6 +304,96 @@ static inline uint64_t hts_str2uint(const char *in, char **end, int bits,
304304
return n;
305305
}
306306

307+
/// Convert a string to a double, with overflow detection
308+
/** @param[in] in Input string
309+
@param[out] end Returned end pointer
310+
@param[out] failed Location of overflow flag
311+
@return String value converted to a double
312+
313+
Converts a floating point value string to a double. The string should
314+
have the format [+-]?[0-9]*[.]?[0-9]* with at least one and no more than 15
315+
digits. Strings that do not match (inf, nan, values with exponents) will
316+
be passed on to strtod() for processing.
317+
318+
If the value is too big, the largest possible value will be returned;
319+
if it is too small to be represented in a double zero will be returned.
320+
In both cases errno will be set to ERANGE.
321+
322+
If no characters could be converted, *failed will be set to 1.
323+
324+
The address of the first character following the converted number will
325+
be stored in *end.
326+
327+
Both end and failed must be non-NULL.
328+
*/
329+
330+
static inline double hts_str2dbl(const char *in, char **end, int *failed) {
331+
uint64_t n = 0;
332+
int max_len = 15;
333+
const unsigned char *v = (const unsigned char *) in;
334+
const unsigned int ascii_zero = '0'; // Prevents conversion to signed
335+
int neg = 0, point = -1;
336+
double d;
337+
static double D[] = {1,1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7,
338+
1e8, 1e9, 1e10,1e11,1e12,1e13,1e14,1e15,
339+
1e16,1e17,1e18,1e19,1e20};
340+
341+
while (isspace(*v))
342+
v++;
343+
344+
if (*v == '-') {
345+
neg = 1;
346+
v++;
347+
} else if (*v == '+') {
348+
v++;
349+
}
350+
351+
switch(*v) {
352+
case '1': case '2': case '3': case '4':
353+
case '5': case '6': case '7': case '8': case '9':
354+
break;
355+
356+
case '0':
357+
if (v[1] != 'x' && v[1] != 'X') break;
358+
// else fall through (hex number)
359+
360+
default:
361+
// Non numbers, like NaN, Inf
362+
d = strtod(in, end);
363+
if (*end == in)
364+
*failed = 1;
365+
return d;
366+
}
367+
368+
while (*v == '0') ++v;
369+
370+
const unsigned char *start = v;
371+
372+
while (--max_len && *v>='0' && *v<='9')
373+
n = n*10 + *v++ - ascii_zero;
374+
if (max_len && *v == '.') {
375+
point = v - start;
376+
v++;
377+
while (--max_len && *v>='0' && *v<='9')
378+
n = n*10 + *v++ - ascii_zero;
379+
}
380+
if (point < 0)
381+
point = v - start;
382+
383+
// Outside the scope of this quick and dirty parser.
384+
if (!max_len || *v == 'e' || *v == 'E') {
385+
d = strtod(in, end);
386+
if (*end == in)
387+
*failed = 1;
388+
return d;
389+
}
390+
391+
*end = (char *)v;
392+
d = n / D[v - start - point];
393+
394+
return neg ? -d : d;
395+
}
396+
307397

308398
#ifdef __cplusplus
309399
}

vcf.c

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2201,7 +2201,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
22012201
{
22022202
if ( !bcf_hdr_nsamples(h) ) return 0;
22032203

2204-
static int extreme_int_warned = 0;
2204+
static int extreme_val_warned = 0;
22052205
char *r, *t;
22062206
int j, l, m, g, overflow = 0;
22072207
khint_t k;
@@ -2425,18 +2425,18 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
24252425
} else if ((z->y>>4&0xf) == BCF_HT_INT) {
24262426
int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
24272427
for (l = 0;; ++t) {
2428-
if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
2429-
else
2430-
{
2428+
if (*t == '.') {
2429+
x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
2430+
} else {
24312431
overflow = 0;
24322432
char *te;
24332433
long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
24342434
if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
24352435
{
2436-
if ( !extreme_int_warned )
2436+
if ( !extreme_val_warned )
24372437
{
2438-
hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,h->id[BCF_DT_ID][fmt[j-1].key].key,bcf_seqname_safe(h,v), v->pos+1);
2439-
extreme_int_warned = 1;
2438+
hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
2439+
extreme_val_warned = 1;
24402440
}
24412441
tmp_val = bcf_int32_missing;
24422442
}
@@ -2450,8 +2450,20 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
24502450
} else if ((z->y>>4&0xf) == BCF_HT_REAL) {
24512451
float *x = (float*)(z->buf + z->size * (size_t)m);
24522452
for (l = 0;; ++t) {
2453-
if (*t == '.' && !isdigit_c(t[1])) bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
2454-
else x[l++] = strtod(t, &t);
2453+
if (*t == '.' && !isdigit_c(t[1])) {
2454+
bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
2455+
} else {
2456+
overflow = 0;
2457+
char *te;
2458+
float tmp_val = hts_str2dbl(t, &te, &overflow);
2459+
if ( (te==t || overflow) && !extreme_val_warned )
2460+
{
2461+
hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
2462+
extreme_val_warned = 1;
2463+
}
2464+
x[l++] = tmp_val;
2465+
t = te;
2466+
}
24552467
if (*t != ',') break;
24562468
}
24572469
if ( !l ) bcf_float_set_missing(x[l++]); // An empty field, insert missing value
@@ -2768,8 +2780,9 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
27682780
float *val_f = (float *)a_val;
27692781
for (i = 0, t = val; i < n_val; ++i, ++t)
27702782
{
2771-
val_f[i] = strtod(t, &te);
2772-
if ( te==t ) // conversion failed
2783+
overflow = 0;
2784+
val_f[i] = hts_str2dbl(t, &te, &overflow);
2785+
if ( te==t || overflow ) // conversion failed
27732786
bcf_float_set_missing(val_f[i]);
27742787
for (t = te; *t && *t != ','; t++);
27752788
}

0 commit comments

Comments
 (0)