clflushopt
diff --git a/‎glint/src/main/java/co/clflushopt/glint/datasource/CsvDataSource.java‎
Lines changed: 46 additions & 8 deletions b/‎glint/src/main/java/co/clflushopt/glint/datasource/CsvDataSource.java‎
Lines changed: 46 additions & 8 deletions
diff --git a/‎glint/src/main/java/co/clflushopt/glint/datasource/CsvReaderIterable.java‎
Lines changed: 26 additions & 0 deletions b/‎glint/src/main/java/co/clflushopt/glint/datasource/CsvReaderIterable.java‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎glint/src/main/java/co/clflushopt/glint/datasource/CsvReaderIterator.java‎
Lines changed: 172 additions & 0 deletions b/‎glint/src/main/java/co/clflushopt/glint/datasource/CsvReaderIterator.java‎
Lines changed: 172 additions & 0 deletions
diff --git a/‎glint/src/main/java/co/clflushopt/glint/datasource/DataSource.java‎
Lines changed: 1 addition & 1 deletion b/‎glint/src/main/java/co/clflushopt/glint/datasource/DataSource.java‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎glint/src/main/java/co/clflushopt/glint/types/ArrowFieldVector.java‎
Lines changed: 83 additions & 0 deletions b/‎glint/src/main/java/co/clflushopt/glint/types/ArrowFieldVector.java‎
Lines changed: 83 additions & 0 deletions
@@ -2,6 +2,7 @@
 
 import java.io.File;
 import java.io.FileNotFoundException;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Optional;
 import java.util.logging.Logger;
@@ -12,6 +13,7 @@
 
 import co.clflushopt.glint.types.ArrowTypes;
 import co.clflushopt.glint.types.Field;
+import co.clflushopt.glint.types.RecordBatch;
 import co.clflushopt.glint.types.Schema;
 
 /**
@@ -23,7 +25,7 @@
  * names are inferred from the header, if a header is not present then the field
  * names will be `field_0...field_n`.
  */
-public class CsvDataSource {
+public class CsvDataSource implements DataSource {
     private final Schema schema;
     private final String filename;
     private final Boolean hasHeaders;
@@ -45,14 +47,19 @@ public CsvDataSource(String filename, Optional<Schema> schema, Boolean hasHeader
         }
     }
 
-    private Schema inferSchema() throws FileNotFoundException {
-        logger.info("Schema inference triggered");
-
+    /**
+     * Resolves {@code filename} to a {@link File} handle, verifying that it exists.
+     *
+     * @return the file named by {@code filename}
+     * @throws FileNotFoundException if nothing exists at that path
+     */
+    private File open() throws FileNotFoundException {
         var file = new File(filename);
         if (!file.exists()) {
             logger.info("File was not found");
             throw new FileNotFoundException("file with name " + filename + " was not found");
         }
+        return file;
+    }
+
+    private Schema inferSchema() throws FileNotFoundException {
+        logger.info("Schema inference triggered");
+
+        var file = open();
 
         var parser = getCsvParser(getCsvDefaultSettings());
         parser.beginParsing(file);
@@ -70,11 +77,14 @@ private Schema inferSchema() throws FileNotFoundException {
         parser.stopParsing();
 
         if (hasHeaders) {
-            return new Schema(List.of(headers).stream()
-                    .map(columnName -> new Field(columnName, ArrowTypes.StringType)).toList());
+            return new Schema(Streams.mapWithIndex(List.of(headers).stream(), (columnName,
+                    columnIndex) -> new Field(columnName, (int) columnIndex, ArrowTypes.StringType))
+                    .toList());
+
         } else {
-            return new Schema(Streams.mapWithIndex(List.of(headers).stream(), (_field,
-                    index) -> new Field(String.format("field_%d", index), ArrowTypes.StringType))
+            return new Schema(Streams.mapWithIndex(List.of(headers).stream(),
+                    (_field, index) -> new Field(String.format("field_%d", index), (int) index,
+                            ArrowTypes.StringType))
                     .toList());
         }
 
@@ -115,4 +125,32 @@ public Logger getLogger() {
         return logger;
     }
 
+    /**
+     * Starts parsing the CSV file and exposes its rows as a lazily read sequence of
+     * record batches.
+     *
+     * An empty {@code projection} reads every column of the source schema;
+     * otherwise only the named columns are parsed and the batches carry the
+     * projected schema.
+     *
+     * @param projection names of the columns to read, empty to read all columns.
+     * @return an iterable over batches built with this source's batch size, or an
+     *         empty iterable when the scan could not be started (the failure is
+     *         logged).
+     */
+    @Override
+    public Iterable<RecordBatch> scan(List<String> projection) {
+        try {
+            var file = this.open();
+            var readSchema = this.schema;
+            var settings = this.getCsvDefaultSettings();
+
+            if (!projection.isEmpty()) {
+                readSchema = this.schema.select(projection);
+                settings.selectFields(projection.toArray(new String[0]));
+            }
+            settings.setHeaderExtractionEnabled(hasHeaders);
+            if (!hasHeaders) {
+                // Headers label the file's columns by position, so they must come from
+                // the full schema; labelling with the projected names would bind a
+                // projected column such as `field_2` to the first column of the file.
+                settings.setHeaders(this.schema.getFields().stream().map(field -> field.name())
+                        .toArray(String[]::new));
+            }
+
+            var parser = getCsvParser(settings);
+            parser.beginParsing(file);
+            var format = parser.getDetectedFormat();
+            logger.info(String.format("Detected format with delimiter: %s and line separator: %s",
+                    format.getDelimiterString(), format.getLineSeparator()));
+
+            return new CsvReaderIterable(readSchema, parser, this.batchSize);
+        } catch (Exception e) {
+            // Keep the best-effort contract (empty result) but never lose the cause.
+            logger.log(java.util.logging.Level.WARNING,
+                    "Unable to start scanning " + filename + ", returning no batches", e);
+            return new ArrayList<>();
+        }
+    }
 }
@@ -0,0 +1,26 @@
+package co.clflushopt.glint.datasource;
+
+import java.util.Iterator;
+
+import com.univocity.parsers.csv.CsvParser;
+
+import co.clflushopt.glint.types.RecordBatch;
+import co.clflushopt.glint.types.Schema;
+
+/**
+ * {@link Iterable} facade over an already started CSV parse.
+ *
+ * Every iterator handed out wraps the same {@link CsvParser} instance, so the
+ * rows consumed through one iterator are not seen again by another.
+ */
+public class CsvReaderIterable implements Iterable<RecordBatch> {
+    private final Schema schema;
+    private final CsvParser parser;
+    private final int batchSize;
+
+    /**
+     * @param schema    schema of the batches to produce.
+     * @param parser    parser the rows are pulled from.
+     * @param batchSize number of rows requested per batch.
+     */
+    public CsvReaderIterable(final Schema schema, final CsvParser parser, final int batchSize) {
+        this.batchSize = batchSize;
+        this.parser = parser;
+        this.schema = schema;
+    }
+
+    /**
+     * @return a new {@link CsvReaderIterator} over the wrapped parser.
+     */
+    @Override
+    public Iterator<RecordBatch> iterator() {
+        final Iterator<RecordBatch> batches = new CsvReaderIterator(this.schema, this.parser,
+                this.batchSize);
+        return batches;
+    }
+
+}
@@ -0,0 +1,172 @@
+package co.clflushopt.glint.datasource;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
+
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.BigIntVector;
+import org.apache.arrow.vector.Float4Vector;
+import org.apache.arrow.vector.Float8Vector;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.SmallIntVector;
+import org.apache.arrow.vector.TinyIntVector;
+import org.apache.arrow.vector.ValueVector;
+import org.apache.arrow.vector.VarCharVector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+
+import com.univocity.parsers.common.record.Record;
+import com.univocity.parsers.csv.CsvParser;
+
+import co.clflushopt.glint.types.ArrowFieldVector;
+import co.clflushopt.glint.types.RecordBatch;
+import co.clflushopt.glint.types.Schema;
+import co.clflushopt.glint.util.IndexedStream;
+
+/**
+ * Iterator that drains a {@link CsvParser} and packs the parsed records into
+ * Arrow-backed {@link RecordBatch}es of up to {@code batchSize} rows.
+ *
+ * The first batch is read lazily on the first call to {@link #hasNext()} or
+ * {@link #next()}; afterwards the iterator always holds the upcoming batch.
+ */
+public class CsvReaderIterator implements Iterator<RecordBatch> {
+    private static final Logger logger = Logger
+            .getLogger(CsvReaderIterator.class.getSimpleName());
+
+    private final Schema schema;
+    private final CsvParser parser;
+    private final int batchSize;
+    // A single allocator serves every batch of this iterator instead of creating
+    // a fresh root allocator per batch.
+    private final RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    private RecordBatch next;
+    private boolean started;
+
+    /**
+     * @param schema    schema of the batches to produce.
+     * @param parser    parser the records are pulled from.
+     * @param batchSize number of rows requested per batch.
+     */
+    public CsvReaderIterator(Schema schema, CsvParser parser, int batchSize) {
+        this.schema = schema;
+        this.parser = parser;
+        this.batchSize = batchSize;
+        this.started = false;
+    }
+
+    /**
+     * @return {@code true} while another batch is available.
+     */
+    @Override
+    public boolean hasNext() {
+        if (!started) {
+            started = true;
+            next = nextBatch();
+        }
+        return next != null;
+    }
+
+    /**
+     * Returns the pending batch and pre-loads the following one.
+     *
+     * @return the next batch of records.
+     * @throws NoSuchElementException when the parser has no more records.
+     */
+    @Override
+    public RecordBatch next() {
+        // Check for exhaustion first so a finished parser is never polled again.
+        if (!hasNext()) {
+            throw new NoSuchElementException(
+                    "Cannot read past the end of " + CsvReaderIterator.class.getSimpleName());
+        }
+
+        RecordBatch out = next;
+        next = nextBatch();
+        return out;
+    }
+
+    /**
+     * Reads records until the batch is full or the parser is exhausted.
+     *
+     * @return the assembled batch, or {@code null} when no record was left.
+     */
+    private RecordBatch nextBatch() {
+        ArrayList<Record> rows = new ArrayList<>(batchSize);
+
+        Record line;
+        do {
+            line = parser.parseNextRecord();
+            if (line != null) {
+                rows.add(line);
+            }
+        } while (line != null && rows.size() < batchSize);
+
+        if (rows.isEmpty()) {
+            return null;
+        }
+
+        return createBatch(rows);
+    }
+
+    /**
+     * Reads the trimmed text of a cell, mapping a missing value to the empty string.
+     */
+    private static String cell(Record row, String column) {
+        return row.getValue(column, "").trim();
+    }
+
+    /**
+     * Converts the parsed rows into Arrow vectors, one per schema field.
+     *
+     * Empty cells become nulls in numeric columns and empty strings in string
+     * columns.
+     *
+     * @param rows the records of the batch, never empty.
+     * @return the batch holding one column vector per schema field.
+     * @throws IllegalStateException when the schema holds an unsupported type.
+     */
+    private RecordBatch createBatch(ArrayList<Record> rows) {
+        final int rowCount = rows.size();
+
+        VectorSchemaRoot root = VectorSchemaRoot.create(schema.toArrow(), allocator);
+        root.getFieldVectors().forEach(v -> v.setInitialCapacity(rowCount));
+        root.allocateNew();
+
+        for (ValueVector vector : root.getFieldVectors()) {
+            final String column = vector.getName();
+
+            if (vector instanceof VarCharVector varChars) {
+                for (int i = 0; i < rowCount; i++) {
+                    // Arrow VarChar data is UTF-8; never rely on the platform charset.
+                    varChars.setSafe(i, cell(rows.get(i), column)
+                            .getBytes(java.nio.charset.StandardCharsets.UTF_8));
+                }
+            } else if (vector instanceof TinyIntVector tinyInts) {
+                for (int i = 0; i < rowCount; i++) {
+                    String text = cell(rows.get(i), column);
+                    if (text.isEmpty()) {
+                        tinyInts.setNull(i);
+                    } else {
+                        tinyInts.set(i, Byte.parseByte(text));
+                    }
+                }
+            } else if (vector instanceof SmallIntVector smallInts) {
+                for (int i = 0; i < rowCount; i++) {
+                    String text = cell(rows.get(i), column);
+                    if (text.isEmpty()) {
+                        smallInts.setNull(i);
+                    } else {
+                        smallInts.set(i, Short.parseShort(text));
+                    }
+                }
+            } else if (vector instanceof IntVector ints) {
+                for (int i = 0; i < rowCount; i++) {
+                    String text = cell(rows.get(i), column);
+                    if (text.isEmpty()) {
+                        ints.setNull(i);
+                    } else {
+                        ints.set(i, Integer.parseInt(text));
+                    }
+                }
+            } else if (vector instanceof BigIntVector bigInts) {
+                for (int i = 0; i < rowCount; i++) {
+                    String text = cell(rows.get(i), column);
+                    if (text.isEmpty()) {
+                        bigInts.setNull(i);
+                    } else {
+                        bigInts.set(i, Long.parseLong(text));
+                    }
+                }
+            } else if (vector instanceof Float4Vector floats) {
+                for (int i = 0; i < rowCount; i++) {
+                    String text = cell(rows.get(i), column);
+                    if (text.isEmpty()) {
+                        floats.setNull(i);
+                    } else {
+                        floats.set(i, Float.parseFloat(text));
+                    }
+                }
+            } else if (vector instanceof Float8Vector doubles) {
+                for (int i = 0; i < rowCount; i++) {
+                    String text = cell(rows.get(i), column);
+                    if (text.isEmpty()) {
+                        doubles.setNull(i);
+                    } else {
+                        doubles.set(i, Double.parseDouble(text));
+                    }
+                }
+            } else {
+                throw new IllegalStateException(
+                        "No support for reading CSV columns with data type " + vector);
+            }
+            vector.setValueCount(rowCount);
+        }
+
+        return new RecordBatch(schema, root.getFieldVectors().stream().map(ArrowFieldVector::new)
+                .collect(Collectors.toList()));
+    }
+}
@@ -30,5 +30,5 @@ public interface DataSource {
      * @param projection
      * @return `RecordBatch` with only requested projections.
      */
-    public List<RecordBatch> scan(List<String> projection);
+    public Iterable<RecordBatch> scan(List<String> projection);
 }
@@ -0,0 +1,83 @@
+package co.clflushopt.glint.types;
+
+import org.apache.arrow.vector.BigIntVector;
+import org.apache.arrow.vector.BitVector;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.Float4Vector;
+import org.apache.arrow.vector.Float8Vector;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.SmallIntVector;
+import org.apache.arrow.vector.TinyIntVector;
+import org.apache.arrow.vector.VarCharVector;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+
+/**
+ * Wrapper around Arrow FieldVector that exposes it through the
+ * {@link ColumnVector} interface.
+ *
+ * Supported vectors are bit, 8/16/32/64-bit integer, 32/64-bit floating point
+ * and UTF-8 string vectors; anything else is rejected with an
+ * {@link IllegalStateException}.
+ */
+public class ArrowFieldVector implements ColumnVector {
+    private final FieldVector field;
+
+    /**
+     * @param field the Arrow vector to wrap.
+     */
+    public ArrowFieldVector(FieldVector field) {
+        this.field = field;
+    }
+
+    /**
+     * @return the Arrow type matching the wrapped vector's class.
+     * @throws IllegalStateException when the vector class is not supported.
+     */
+    @Override
+    public ArrowType getType() {
+        if (field instanceof BitVector) {
+            return ArrowTypes.BooleanType;
+        } else if (field instanceof TinyIntVector) {
+            return ArrowTypes.Int8Type;
+        } else if (field instanceof SmallIntVector) {
+            return ArrowTypes.Int16Type;
+        } else if (field instanceof IntVector) {
+            return ArrowTypes.Int32Type;
+        } else if (field instanceof BigIntVector) {
+            return ArrowTypes.Int64Type;
+        } else if (field instanceof Float4Vector) {
+            return ArrowTypes.FloatType;
+        } else if (field instanceof Float8Vector) {
+            return ArrowTypes.DoubleType;
+        } else if (field instanceof VarCharVector) {
+            return ArrowTypes.StringType;
+        } else {
+            throw new IllegalStateException("Unsupported field vector type: " + field.getClass());
+        }
+    }
+
+    /**
+     * Reads the value stored at the given position.
+     *
+     * @param i index of the value inside the vector.
+     * @return the boxed value, a {@link String} for string vectors, or
+     *         {@code null} when the slot is null.
+     * @throws IllegalStateException when the vector class is not supported.
+     */
+    @Override
+    public Object getValue(int i) {
+        if (field.isNull(i)) {
+            return null;
+        }
+
+        if (field instanceof BitVector bits) {
+            return bits.get(i) == 1;
+        } else if (field instanceof TinyIntVector tinyInts) {
+            return tinyInts.get(i);
+        } else if (field instanceof SmallIntVector smallInts) {
+            return smallInts.get(i);
+        } else if (field instanceof IntVector ints) {
+            return ints.get(i);
+        } else if (field instanceof BigIntVector bigInts) {
+            return bigInts.get(i);
+        } else if (field instanceof Float4Vector floats) {
+            return floats.get(i);
+        } else if (field instanceof Float8Vector doubles) {
+            return doubles.get(i);
+        } else if (field instanceof VarCharVector varChars) {
+            byte[] bytes = varChars.get(i);
+            // Arrow stores VarChar as UTF-8; decoding with the platform charset
+            // would corrupt non-ASCII text on non-UTF-8 systems.
+            return bytes == null ? null
+                    : new String(bytes, java.nio.charset.StandardCharsets.UTF_8);
+        } else {
+            throw new IllegalStateException("Unsupported field vector type: " + field.getClass());
+        }
+    }
+
+    /**
+     * @return the number of values held by the wrapped vector.
+     */
+    @Override
+    public int getSize() {
+        return field.getValueCount();
+    }
+
+    /**
+     * @return the wrapped Arrow vector.
+     */
+    public FieldVector getField() {
+        return field;
+    }
+}
Original file line number	Diff line number	Diff line change
`@@ -30,5 +30,5 @@ public interface DataSource {`
`30`	`30`	`* @param projection`
`31`	`31`	* @return `RecordBatch` with only requested projections.
`32`	`32`	`*/`
`33`		`- public List<RecordBatch> scan(List<String> projection);`
	`33`	`+ public Iterable<RecordBatch> scan(List<String> projection);`
`34`	`34`	`}`