Skip to content

Commit b8dbaf4

Browse files
author
Shashi Gowda
committed
allow space-delimited files
1 parent 71a7578 commit b8dbaf4

File tree

3 files changed

+73
-61
lines changed

3 files changed

+73
-61
lines changed

src/csv.jl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,8 @@ end
3737
optionsiter(opts::AbstractVector, header) = optionsiter(opts)
3838

3939
tofield(f::AbstractField, opts) = f
40-
tofield(f::AbstractToken, opts) =
41-
Field(f, delim=opts.endchar)
42-
tofield(f::StringToken, opts) =
43-
Field(Quoted(f), delim=opts.endchar)
40+
tofield(f::AbstractToken, opts) = Field(f)
41+
tofield(f::StringToken, opts) = Field(Quoted(f))
4442
tofield(f::Type, opts) = tofield(fromtype(f), opts)
4543
tofield(f::Type{String}, opts) = tofield(fromtype(StrRange), opts)
4644
tofield(f::DateFormat, opts) = tofield(DateTimeToken(DateTime, f), opts)
@@ -56,6 +54,7 @@ Read CSV from `file`. Returns a tuple of 2 elements:
5654
5755
- `file`: either an IO object or file name string
5856
- `delim`: the delimiter character
57+
- `spacedelim`: (Bool) parse space-delimited files, treating spaces and tabs as field delimiters. `delim` has no effect if `true`.
5958
- `quotechar`: character used to quote strings, defaults to `"`
6059
- `escapechar`: character used to escape quotechar in strings. (could be the same as quotechar)
6160
- `pooledstrings`: whether to try and create PooledArray of strings
@@ -134,6 +133,7 @@ end
134133

135134
# read CSV in a string
136135
function _csvread_internal(str::AbstractString, delim=',';
136+
spacedelim=false,
137137
quotechar='"',
138138
escapechar='\\',
139139
pooledstrings=true,
@@ -154,7 +154,7 @@ function _csvread_internal(str::AbstractString, delim=',';
154154
filename=nothing,
155155
type_detect_rows=20)
156156

157-
opts = LocalOpts(delim, quotechar, escapechar, false, false)
157+
opts = LocalOpts(delim, spacedelim, quotechar, escapechar, false, false)
158158
len = endof(str)
159159
pos = start(str)
160160
rowlength_sum = 0 # sum of lengths of rows, for estimating nrows
@@ -292,7 +292,7 @@ function _csvread_internal(str::AbstractString, delim=',';
292292
if l !== endof(str) && err.pos >= l && !field.eoldelim
293293
if fieldtype(field) <: AbstractString || fieldtype(field) <: StrRange
294294
# retry assuming newlines can be part of the field
295-
wopts = LocalOpts(opts.endchar, opts.quotechar, opts.escapechar, opts.includequotes, true)
295+
wopts = LocalOpts(opts.endchar, opts.spacedelim, opts.quotechar, opts.escapechar, opts.includequotes, true)
296296
fieldsvec = Any[rec.fields...]
297297
fieldsvec[err.colno] = swapinner(field, WrapLocalOpts(wopts, field.inner))
298298
rec = Record((fieldsvec...))

src/field.jl

Lines changed: 22 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -25,20 +25,22 @@ function tryparsenext end
2525
2626
Options local to the token currently being parsed.
2727
- `endchar`: Till where to parse. (e.g. delimiter or quote ending character)
28+
- `spacedelim`: Treat spaces as delimiters
2829
- `quotechar`: the quote character
2930
- `escapechar`: char that escapes the quote
3031
- `includequotes`: whether to include quotes while parsing
3132
- `includenewlines`: whether to include newlines while parsing
3233
"""
3334
immutable LocalOpts
3435
endchar::Char # End parsing at this char
36+
spacedelim::Bool
3537
quotechar::Char # Quote char
3638
escapechar::Char # Escape char
3739
includequotes::Bool # Whether to include quotes in string parsing
3840
includenewlines::Bool # Whether to include newlines in string parsing
3941
end
4042

41-
const default_opts = LocalOpts(',', '"', '\\', false, false)
43+
const default_opts = LocalOpts(',', false, '"', '\\', false, false)
4244
# helper function for easy testing:
4345
@inline function tryparsenext(tok::AbstractToken, str, opts::LocalOpts=default_opts)
4446
tryparsenext(tok, str, start(str), endof(str), opts)
@@ -220,7 +222,9 @@ function tryparsenext{T}(s::StringToken{T}, str, i, len, opts)
220222

221223
while i <= len
222224
c, ii = next(str, i)
223-
if c == opts.endchar
225+
# Group the character test explicitly: `&&` binds tighter than `||`, so the
# unparenthesised form ended the token on a tab even when `spacedelim` was false.
if opts.spacedelim && (c == ' ' || c == '\t')
226+
break
227+
elseif !opts.spacedelim && c == opts.endchar
224228
if opts.endchar == opts.quotechar
225229
# this means we're inside a quoted string
226230
if opts.quotechar == opts.escapechar
@@ -251,6 +255,8 @@ function tryparsenext{T}(s::StringToken{T}, str, i, len, opts)
251255
i = ii
252256
end
253257
break
258+
# Group the character test explicitly: `&&` binds tighter than `||`, so the
# unparenthesised form matched a tab regardless of `spacedelim`.
elseif opts.spacedelim && (c == ' ' || c == '\t')
259+
break
254260
elseif (!opts.includenewlines && isnewline(c))
255261
break
256262
end
@@ -357,7 +363,7 @@ function tryparsenext{T}(q::Quoted{T}, str, i, len, opts)
357363
end
358364

359365
if quotestarted
360-
qopts = LocalOpts(quotechar(q, opts), quotechar(q, opts), escapechar(q, opts),
366+
qopts = LocalOpts(quotechar(q, opts), false, quotechar(q, opts), escapechar(q, opts),
361367
q.includequotes, q.includenewlines)
362368
@chk2 x, i = tryparsenext(q.inner, str, i, len, qopts)
363369
else
@@ -420,7 +426,7 @@ function tryparsenext{T}(dt::DateTimeToken{T}, str, i, len, opts)
420426
if isnull(nt)
421427
return R(), i
422428
else
423-
return R(T(unsafe_get(nt)...)), i
429+
return R(T(nt.value...)), i
424430
end
425431
end
426432

@@ -491,7 +497,7 @@ function tryparsenext{T}(na::NAToken{T}, str, i, len, opts)
491497
return R(T(x)), ii
492498

493499
@label maybe_null
494-
naopts = LocalOpts(endchar(na,opts), opts.quotechar,
500+
naopts = LocalOpts(endchar(na,opts), opts.spacedelim, opts.quotechar,
495501
opts.escapechar, false, opts.includenewlines)
496502
@chk2 nastr, ii = tryparsenext(StringToken(String), str, i, len, naopts)
497503
if !isempty(searchsorted(na.nastrings, nastr))
@@ -520,40 +526,30 @@ immutable Field{T,S<:AbstractToken} <: AbstractField{T}
520526
ignore_init_whitespace::Bool
521527
ignore_end_whitespace::Bool
522528
eoldelim::Bool
523-
spacedelim::Bool
524-
delim::Nullable{Char}
525529
end
526530

527-
function Field{S}(inner::S; ignore_init_whitespace=true, ignore_end_whitespace=true,
528-
eoldelim=false, spacedelim=false, delim=Nullable{Char}())
531+
function Field{S}(inner::S; ignore_init_whitespace=true, ignore_end_whitespace=true, eoldelim=false)
529532
T = fieldtype(inner)
530-
Field{T,S}(inner, ignore_init_whitespace, ignore_end_whitespace,
531-
eoldelim, spacedelim, delim)
533+
Field{T,S}(inner, ignore_init_whitespace, ignore_end_whitespace, eoldelim)
532534
end
533535

534536
function Field(f::Field; inner=f.inner, ignore_init_whitespace=f.ignore_init_whitespace,
535537
ignore_end_whitespace=f.ignore_end_whitespace,
536-
eoldelim=f.eoldelim, spacedelim=f.spacedelim, delim=f.delim)
538+
eoldelim=f.eoldelim)
537539
T = fieldtype(inner)
538-
Field{T,typeof(inner)}(inner, ignore_init_whitespace, ignore_end_whitespace,
539-
eoldelim, spacedelim, delim)
540+
Field{T,typeof(inner)}(inner, ignore_init_whitespace,
541+
ignore_end_whitespace, eoldelim)
540542
end
541543

542-
@inline delim(f::Field, opts) = get(f.delim, opts.endchar)
543-
544544
"""
    swapinner(f::Field, inner::AbstractToken; ignore_init_whitespace, ignore_end_whitespace, eoldelim)

Build a new `Field` that wraps `inner` in place of `f.inner`.

Each keyword defaults to the corresponding setting of `f`, so by default the
returned field keeps `f`'s leading-whitespace, trailing-whitespace and
end-of-line handling. Pass a keyword explicitly to override that one setting.
"""
function swapinner(f::Field, inner::AbstractToken;
                   ignore_init_whitespace=f.ignore_init_whitespace,
                   ignore_end_whitespace=f.ignore_end_whitespace,
                   eoldelim=f.eoldelim)
    # Forward every option under its own name. The leading-whitespace option
    # used to be filled from the trailing-whitespace one, both in the default
    # above and in this call, which discarded the field's actual setting.
    return Field(inner;
                 ignore_init_whitespace=ignore_init_whitespace,
                 ignore_end_whitespace=ignore_end_whitespace,
                 eoldelim=eoldelim)
end
@@ -574,13 +570,14 @@ function tryparsenext{T}(f::Field{T}, str, i, len, opts)
574570
i0 = i
575571
while i <= len
576572
@inbounds c, ii = next(str, i)
577-
!isspace(c) && break
573+
!opts.spacedelim && opts.endchar == '\t' && c == '\t' && (i =ii; @goto done)
574+
!isspace(c) && c != '\t' && break
578575
i = ii
579-
delim(f, opts) == '\t' && c == '\t' && @goto done
580576
end
581577

582-
f.spacedelim && i > i0 && @goto done
578+
opts.spacedelim && i > i0 && @goto done
583579
end
580+
# todo don't ignore whitespace AND spacedelim
584581

585582
if i > len
586583
if f.eoldelim
@@ -591,8 +588,8 @@ function tryparsenext{T}(f::Field{T}, str, i, len, opts)
591588
end
592589

593590
@inbounds c, ii = next(str, i)
594-
delim(f, opts) == c && (i=ii; @goto done)
595-
f.spacedelim && isspace(c) && (i=ii; @goto done)
591+
opts.spacedelim && (isspace(c) || c == '\t') && (i=ii; @goto done)
592+
!opts.spacedelim && opts.endchar == c && (i=ii; @goto done)
596593

597594
if f.eoldelim
598595
if c == '\r'

test/runtests.jl

Lines changed: 45 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -51,30 +51,32 @@ import TextParse: StringToken
5151
@test tryparsenext(StringToken(String), "x\ny") |> unwrap == ("x", 2)
5252
@test tryparsenext(StringToken(String), "x,y") |> unwrap == ("x", 2) # test escape
5353

54-
opts = LocalOpts(',', '"', '"', true, true)
54+
opts = LocalOpts(',', false, '"', '"', true, true)
5555
@test tryparsenext(StringToken(String), "", opts) |> unwrap == ("", 1)
5656
@test tryparsenext(StringToken(String), "\"\"", opts) |> unwrap == ("\"\"", 3)
5757
@test tryparsenext(StringToken(String), "x", opts) |> unwrap == ("x", 2)
5858
# test including new lines
5959
@test tryparsenext(StringToken(String), "x\ny", opts) |> unwrap == ("x\ny", 4)
6060
@test tryparsenext(StringToken(String), "\"x\ny\"", opts) |> unwrap == ("\"x\ny\"", 6)
6161

62-
opts = LocalOpts(',', '"', '"', false, true)
62+
opts = LocalOpts(',', false, '"', '"', false, true)
6363
# test that includequotes option doesn't affect string
6464
@test tryparsenext(StringToken(String), "\"\"", opts) |> unwrap == ("\"\"", 3)
6565

66-
opts = LocalOpts(',', '"', '\\', false, false)
66+
opts = LocalOpts(',', false, '"', '\\', false, false)
6767
str = "Owner 2 ”Vicepresident\"\""
6868
@test tryparsenext(Quoted(String), str) |> unwrap == (str, endof(str)+1)
6969
str1 = "\"Owner 2 ”Vicepresident\"\"\""
7070
@test tryparsenext(Quoted(String,quotechar=Nullable('"'), escapechar=Nullable('"')), str1) |> unwrap == (str, endof(str1)+1)
7171

72+
opts = LocalOpts(',', true, '"', '\\', false, false)
73+
@test tryparsenext(StringToken(String), "x y",1,3, opts) |> unwrap == ("x", 2)
7274
end
7375

7476

7577
import TextParse: Quoted, NAToken, Unknown
7678
@testset "Quoted string parsing" begin
77-
opts = LocalOpts(',', '"', '"', true, true)
79+
opts = LocalOpts(',', false, '"', '"', true, true)
7880

7981
@test tryparsenext(Quoted(String), "\"\"") |> unwrap == ("", 3)
8082
@test tryparsenext(Quoted(String), "\"\" ", opts) |> unwrap == ("", 3)
@@ -103,12 +105,15 @@ import TextParse: Quoted, NAToken, Unknown
103105
@test tryparsenext(Quoted(NAToken(fromtype(Int))), "\"\"") |> unwrap |> failedat == 3
104106
@test tryparsenext(Quoted(NAToken(fromtype(Int))), "\"21\"") |> unwrap |> unwrap == (21, 5)
105107
@test isnull(tryparsenext(Quoted(NAToken(Unknown())), " ") |> unwrap |> first)
106-
opts = LocalOpts(',', '"', '"', false, false)
108+
opts = LocalOpts(',', false,'"', '"', false, false)
107109
@test tryparsenext(Quoted(StringToken(String)), "x,", opts) |> unwrap == ("x", 2)
108110

109111
# stripspaces
110112
@test tryparsenext(Quoted(Percentage()), "\" 10%\",", opts) |> unwrap == (0.1, 7)
111113
@test tryparsenext(Quoted(String), "\" 10%\",", opts) |> unwrap == (" 10%", 7)
114+
opts = LocalOpts(',', true,'"', '"', false, false)
115+
@test tryparsenext(Quoted(StringToken(String)), "\"x y\" y", opts) |> unwrap == ("x y", 6)
116+
@test tryparsenext(Quoted(StringToken(String)), "x y", opts) |> unwrap == ("x", 2)
112117
end
113118

114119
@testset "NA parsing" begin
@@ -121,21 +126,24 @@ end
121126
import TextParse: Field
122127
@testset "Field parsing" begin
123128
f = fromtype(Int)
124-
@test tryparsenext(Field(f,delim=','), "12,3") |> unwrap == (12, 4)
125-
@test tryparsenext(Field(f,delim=','), "12 ,3") |> unwrap == (12, 5)
126-
@test tryparsenext(Field(f,delim=','), " 12 ,3") |> unwrap == (12, 6)
127-
@test tryparsenext(Field(f,delim='\t'), "12\t3") |> unwrap == (12, 4)
128-
@test tryparsenext(Field(f,delim='\t'), "12 \t3") |> unwrap == (12, 5)
129-
@test tryparsenext(Field(f,delim='\t'), " 12 \t 3") |> unwrap == (12, 6)
130-
@test tryparsenext(Field(f,spacedelim=true), " 12 3") |> unwrap == (12, 5)
131-
@test tryparsenext(Field(f,spacedelim=true), " 12 3") |> unwrap == (12, 5)
132-
@test tryparsenext(Field(f,spacedelim=true, ignore_end_whitespace=false), " 12 \t 3") |> unwrap == (12, 5)
133-
@test tryparsenext(Field(f,ignore_end_whitespace=false, delim=' '), "12 3") |> unwrap == (12, 4)
134-
@test tryparsenext(Field(f,ignore_end_whitespace=false, delim='\t'), "12 \t3") |> failedat == 3
135-
@test tryparsenext(Field(f,ignore_end_whitespace=false, delim='\t'), " 12\t 3") |> unwrap == (12,5)
136-
@test tryparsenext(Field(f,eoldelim=true, delim='\t'), " 12\n") |> unwrap == (12,5)
129+
@test tryparsenext(Field(f), "12,3") |> unwrap == (12, 4)
130+
@test tryparsenext(Field(f), "12 ,3") |> unwrap == (12, 5)
131+
@test tryparsenext(Field(f), " 12 ,3") |> unwrap == (12, 6)
132+
opts = LocalOpts('\t', false, 'x','x',true,false)
133+
@test tryparsenext(Field(f), "12\t3", 1, 4, opts) |> unwrap == (12, 4)
134+
@test tryparsenext(Field(f), "12 \t3", 1, 5, opts) |> unwrap == (12, 5)
135+
@test tryparsenext(Field(f), " 12 \t 3", 1, 6, opts) |> unwrap == (12, 6)
136+
opts = LocalOpts('\t', true, 'x','x',true,false)
137+
@test tryparsenext(Field(f), " 12 3", 1, 5, opts) |> unwrap == (12, 5)
138+
@test tryparsenext(Field(f, ignore_end_whitespace=false), " 12 \t 3", 1,6, opts) |> unwrap == (12, 5)
139+
opts = LocalOpts(' ', false, 'x','x',false, false)
140+
@test tryparsenext(Field(f,ignore_end_whitespace=false), "12 3", 1,4,opts) |> unwrap == (12, 4)
141+
# @test tryparsenext(Field(f,ignore_end_whitespace=false), "12 \t3", 1,5,opts) |> failedat == 3
142+
opts = LocalOpts('\t', false, 'x','x',false, false)
143+
@test tryparsenext(Field(f,ignore_end_whitespace=false), " 12\t 3", 1, 6, opts) |> unwrap == (12,5)
144+
@test tryparsenext(Field(f,eoldelim=true), " 12\n", 1, 4, opts) |> unwrap == (12,5)
145+
@test tryparsenext(Field(f,eoldelim=true), " 12\n\r\n", 1, 5, opts) |> unwrap == (12,6)
137146
@test tryparsenext(Field(f,eoldelim=true), " 12") |> unwrap == (12,4)
138-
@test tryparsenext(Field(f,eoldelim=true, delim='\t'), " 12\n\r\n") |> unwrap == (12,6)
139147
end
140148

141149

@@ -151,26 +159,26 @@ end
151159

152160
import TextParse: UseOne
153161
@testset "UseOne" begin
154-
f = UseOne((Field(fromtype(Int), delim=';'), Field(fromtype(Float64)), Field(fromtype(Int), eoldelim=true)), 3)
155-
@test tryparsenext(f, "1; 33.21, 45", 1, 12) |> unwrap == (45, 13)
162+
f = UseOne((Field(fromtype(Int)), Field(fromtype(Float64)), Field(fromtype(Int), eoldelim=true)), 3)
163+
@test tryparsenext(f, "1, 33.21, 45", 1, 12) |> unwrap == (45, 13)
156164
end
157165

158166
import TextParse: Repeated
159167
@testset "Repeated" begin
160-
f = Repeated(Field(fromtype(Int), delim=';'), 3)
161-
@test tryparsenext(f, "1; 33; 45;", 1, 12) |> unwrap == ((1,33,45), 11)
168+
f = Repeated(Field(fromtype(Int)), 3)
169+
@test tryparsenext(f, "1, 33, 45,", 1, 12) |> unwrap == ((1,33,45), 11)
162170

163-
inp = join(map(string, [1:45;]), "; ") * "; "
171+
inp = join(map(string, [1:45;]), ", ") * ", "
164172
out = ntuple(identity, 45)
165-
f2 = Repeated(Field(fromtype(Int), delim=';'), 45)
173+
f2 = Repeated(Field(fromtype(Int)), 45)
166174
@test tryparsenext(f2, inp, 1, length(inp)) |> unwrap == (out, length(inp))
167175
#@benchmark tryparsenext($f2, $inp, 1, length($inp))
168176
end
169177

170178

171179
import TextParse: quotedsplit
172180
@testset "quotedsplit" begin
173-
opts = LocalOpts(',', '"', '\\', false, false)
181+
opts = LocalOpts(',', false, '"', '\\', false, false)
174182
@test quotedsplit("x", opts, false, 1, 1) == ["x"]
175183
@test quotedsplit("x, y", opts, false, 1, 4) == ["x", "y"]
176184
@test quotedsplit("\"x\", \"y\"", opts,false, 1, 8) == ["x", "y"]
@@ -180,7 +188,7 @@ import TextParse: quotedsplit
180188
@test quotedsplit(",", opts, true, 1, 1) == ["", ""]
181189
@test quotedsplit(", ", opts, false, 1, 2) == ["", ""]
182190
str = "1, \"x \"\"y\"\" z\", 1"
183-
qopts = LocalOpts(',', '"', '"', false, false)
191+
qopts = LocalOpts(',', false,'"', '"', false, false)
184192
@test quotedsplit(str, qopts,true, 1, endof(str)) == ["1", "\"x \"\"y\"\" z\"", "1"]
185193
end
186194

@@ -198,7 +206,7 @@ import TextParse: LocalOpts, readcolnames
198206
str2 = """
199207
a, " b", "c", "d\\" e "
200208
"""
201-
opts = LocalOpts(',', '"', '\\', false, false)
209+
opts = LocalOpts(',', false, '"', '\\', false, false)
202210
@test readcolnames(str1, opts, 1, String[]) == (["a", "b", "c d", "e"], 13)
203211
@test readcolnames("\n\r$str1", opts, 3, Dict(3=>"x")) == (["a", "b", "x", "e"], 15)
204212
#@test readcolnames("$str2", opts, 3, Dict(3=>"x")) == (["a", "b", "x", "d\" e"], 24)
@@ -265,7 +273,7 @@ import TextParse: guesscolparsers
265273
x y,1.0,1,
266274
,1.0,,1
267275
"""
268-
opts = LocalOpts(',', '"', '\\', false, false)
276+
opts = LocalOpts(',', false, '"', '\\', false, false)
269277
_, pos = readcolnames(str1, opts, 1, String[])
270278
testtill(i, colparsers=[]) = guesscolparsers(str1, String[], opts, pos, i, colparsers)
271279
@test testtill(0) |> first == Any[]
@@ -300,7 +308,7 @@ end
300308

301309
@testset "date parsing" begin
302310
tok = DateTimeToken(DateTime, dateformat"yyyy-mm-dd HH:MM:SS")
303-
opts = LocalOpts('y', '"', '\\', false, false)
311+
opts = LocalOpts('y', false, '"', '\\', false, false)
304312
str = "1970-02-02 02:20:20"
305313
@test tryparsenext(tok, str, 1, length(str), opts) |> unwrap == (DateTime("1970-02-02T02:20:20"), length(str)+1)
306314
@test tryparsenext(tok, str*"x", 1, length(str)+1, opts) |> unwrap == (DateTime("1970-02-02T02:20:20"), length(str)+1)
@@ -422,6 +430,13 @@ end
422430
1,1,1
423431
"""
424432
@test _csvread(str1, skiplines_begin=3) == (([1], [1], [1]), String["x", "y","z"])
433+
434+
s = """
435+
x,y z
436+
a,b 1
437+
e 3
438+
"""
439+
@test _csvread(s, spacedelim=true) == ((["a,b", "e"],[1,3]), ["x,y","z"])
425440
end
426441

427442
using PooledArrays

0 commit comments

Comments
 (0)