Skip to content

Invalid boolean values #15

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 61 additions & 55 deletions src/ApacheOrcDotNet.WriterTest.App/Program.cs
Original file line number Diff line number Diff line change
@@ -1,68 +1,74 @@
using System;
using ApacheOrcDotNet.FluentSerialization;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using ApacheOrcDotNet.FluentSerialization;

namespace ApacheOrcDotNet.Test.App
{
public class Program
{
public static void Main(string[] args)
{
var baseTime = new DateTime(2017, 3, 16, 0, 0, 0, DateTimeKind.Utc);
var rand = new Random(123);
var testElements = new List<TestClass>();
for (int i = 0; i < 80000; i++)
{
var random = rand.Next();
var set = i / 10000;
var randomInRange = (random % 10000) + set * 10000 - 40000;
var dec = (DateTime.Now - DateTime.Today).Ticks / (decimal)TimeSpan.TicksPerSecond;
var timestamp = baseTime.AddTicks(random);
var element = new TestClass
{
Random = random,
RandomInRange = randomInRange,
Incrementing = i,
SetNumber = set,
Double = (double)i / (set + 1),
Float = (float)i / (set + 1),
Dec = dec,
Timestamp = timestamp,
Str = $"Random={random}, RandomInRange={randomInRange}, Incrementing={i}, SetNumber={set}, Dec={dec}, Timestamp={timestamp:MM/dd/yyyy hh:mm:ss.fffffff}",
DictionaryStr = $"SetNumber={set}"
};
testElements.Add(element);
}
var baseTime = new DateTime(2017, 3, 16, 0, 0, 0, DateTimeKind.Utc);
var rand = new Random(123);
var testElements = new List<TestClass>();
var boolToggler = false;
for (int i = 0; i < 1000; i++)
{
var random = rand.Next();
var set = i / 10000;
var randomInRange = (random % 10000) + set * 10000 - 40000;
var dec = (DateTime.Now - DateTime.Today).Ticks / (decimal)TimeSpan.TicksPerSecond;
var timestamp = baseTime.AddTicks(random);
var element = new TestClass
{
Random = random,
RandomInRange = randomInRange,
Incrementing = i,
SetNumber = set,
Double = (double)i / (set + 1),
Float = (float)i / (set + 1),
Dec = dec,
Timestamp = timestamp,
Str = $"Random={random}, RandomInRange={randomInRange}, Incrementing={i}, SetNumber={set}, Dec={dec}, Timestamp={timestamp:MM/dd/yyyy hh:mm:ss.fffffff}",
DictionaryStr = $"SetNumber={set}",
Boolean = boolToggler,
NullBooleans = boolToggler ? null : true,
};

var serializationConfiguration = new SerializationConfiguration()
.ConfigureType<TestClass>()
.ConfigureProperty(x => x.Dec, x => { x.DecimalPrecision = 14; x.DecimalScale = 9; })
.Build();
boolToggler = !boolToggler;

using (var fileStream = new FileStream("test.orc", FileMode.Create, FileAccess.Write))
using (var writer = new OrcWriter<TestClass>(fileStream, new WriterConfiguration(), serializationConfiguration)) //Use the default configuration
{
writer.AddRows(testElements);
}
}
}
testElements.Add(element);
}

class TestClass
{
public int Random { get; set; }
public int RandomInRange { get; set; }
public int Incrementing { get; set; }
public int SetNumber { get; set; }
public int? AllNulls { get; set; }
public double Double { get; set; }
public float Float { get; set; }
public decimal Dec { get; set; }
public decimal? AllNullsDec { get; set; }
public DateTime Timestamp { get; set; }
public string Str { get; set; }
public string DictionaryStr { get; set; }
}
var serializationConfiguration = new SerializationConfiguration()
.ConfigureType<TestClass>()
.ConfigureProperty(x => x.Dec, x => { x.DecimalPrecision = 14; x.DecimalScale = 9; })
.Build();

using (var fileStream = new FileStream("test.orc", FileMode.Create, FileAccess.Write))
using (var writer = new OrcWriter<TestClass>(fileStream, new WriterConfiguration() { RowIndexStride = 10 }, serializationConfiguration)) //Use the default configuration
{
writer.AddRows(testElements);
}
}
}

class TestClass
{
public int Random { get; set; }
public int RandomInRange { get; set; }
public int Incrementing { get; set; }
public int SetNumber { get; set; }
public int? AllNulls { get; set; }
public double Double { get; set; }
public float Float { get; set; }
public decimal Dec { get; set; }
public decimal? AllNullsDec { get; set; }
public DateTime Timestamp { get; set; }
public string Str { get; set; }
public string DictionaryStr { get; set; }
public bool Boolean { get; set; }
public bool? NullBooleans { get; set; }
}
}
143 changes: 75 additions & 68 deletions src/ApacheOrcDotNet/ColumnTypes/BooleanWriter.cs
Original file line number Diff line number Diff line change
@@ -1,91 +1,98 @@
using ApacheOrcDotNet.Compression;
using ApacheOrcDotNet.Encodings;
using ApacheOrcDotNet.Protocol;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;

namespace ApacheOrcDotNet.ColumnTypes
{
public class BooleanWriter : IColumnWriter<bool?>
{
readonly bool _isNullable;
readonly OrcCompressedBuffer _presentBuffer;
readonly OrcCompressedBuffer _dataBuffer;
readonly bool _isNullable;
readonly OrcCompressedBuffer _presentBuffer;
readonly OrcCompressedBuffer _dataBuffer;
private readonly ContinuousBitWriter _dataWriter;
private readonly ContinuousBitWriter _presentWriter;

public BooleanWriter(bool isNullable, OrcCompressedBufferFactory bufferFactory, uint columnId)
{
_isNullable = isNullable;
ColumnId = columnId;
public BooleanWriter(bool isNullable, OrcCompressedBufferFactory bufferFactory, uint columnId)
{
_isNullable = isNullable;
ColumnId = columnId;

if (_isNullable)
{
_presentBuffer = bufferFactory.CreateBuffer(StreamKind.Present);
_presentBuffer.MustBeIncluded = false;
}
_dataBuffer = bufferFactory.CreateBuffer(StreamKind.Data);
}
if (_isNullable)
{
_presentBuffer = bufferFactory.CreateBuffer(StreamKind.Present);
_presentBuffer.MustBeIncluded = false;
_presentWriter = new ContinuousBitWriter(_presentBuffer);
}
_dataBuffer = bufferFactory.CreateBuffer(StreamKind.Data);
_dataWriter = new ContinuousBitWriter(_dataBuffer);

}

public List<IStatistics> Statistics { get; } = new List<IStatistics>();
public long CompressedLength => Buffers.Sum(s => s.Length);
public uint ColumnId { get; }
public OrcCompressedBuffer[] Buffers => _isNullable ? new[] { _presentBuffer, _dataBuffer } : new[] { _dataBuffer };
public ColumnEncodingKind ColumnEncoding => ColumnEncodingKind.Direct;
public List<IStatistics> Statistics { get; } = new List<IStatistics>();
public long CompressedLength => Buffers.Sum(s => s.Length);
public uint ColumnId { get; }
public OrcCompressedBuffer[] Buffers => _isNullable ? new[] { _presentBuffer, _dataBuffer } : new[] { _dataBuffer };
public ColumnEncodingKind ColumnEncoding => ColumnEncodingKind.Direct;

public void FlushBuffers()
{
if (_isNullable)
{
_presentWriter.Flush();
}

public void FlushBuffers()
{
foreach (var buffer in Buffers)
buffer.Flush();
}
_dataWriter.Flush();

public void Reset()
{
foreach (var buffer in Buffers)
buffer.Reset();
if(_isNullable)
_presentBuffer.MustBeIncluded = false;
Statistics.Clear();
}
foreach (var buffer in Buffers)
buffer.Flush();
}

public void AddBlock(IList<bool?> values)
{
var stats = new BooleanWriterStatistics();
Statistics.Add(stats);
public void Reset()
{
foreach (var buffer in Buffers)
buffer.Reset();
if (_isNullable)
_presentBuffer.MustBeIncluded = false;
Statistics.Clear();
}

public void AddBlock(IList<bool?> values)
{
var stats = new BooleanWriterStatistics();
Statistics.Add(stats);
if (_isNullable)
_presentBuffer.AnnotatePosition(stats, rleValuesToConsume: 0, bitsToConsume: 0);
_dataBuffer.AnnotatePosition(stats, rleValuesToConsume: 0, bitsToConsume: 0);

var valList = new List<bool>(values.Count);
var valList = new List<bool>(values.Count);

if(_isNullable)
{
var presentList = new List<bool>(values.Count);
if (_isNullable)
{
var presentList = new List<bool>(values.Count);

foreach(var value in values)
{
stats.AddValue(value);
if (value.HasValue)
valList.Add(value.Value);
presentList.Add(value.HasValue);
}
foreach (var value in values)
{
stats.AddValue(value);
if (value.HasValue)
valList.Add(value.Value);
presentList.Add(value.HasValue);
}

var presentEncoder = new BitWriter(_presentBuffer);
presentEncoder.Write(presentList);
if (stats.HasNull)
_presentBuffer.MustBeIncluded = true;
}
else
{
foreach(var value in values)
{
stats.AddValue(value);
valList.Add(value.Value);
}
}
_presentWriter.Write(presentList);
if (stats.HasNull)
_presentBuffer.MustBeIncluded = true;
}
else
{
foreach (var value in values)
{
stats.AddValue(value);
valList.Add(value.Value);
}
}

var valEncoder = new BitWriter(_dataBuffer);
valEncoder.Write(valList);
}
}
}
_dataWriter.Write(valList);
}
}
}
54 changes: 54 additions & 0 deletions src/ApacheOrcDotNet/ColumnTypes/ContinuousBitWriter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
using ApacheOrcDotNet.Encodings;
using System.Collections.Generic;
using System.IO;

namespace ApacheOrcDotNet.ColumnTypes
{
/// <summary>
/// Packs boolean values into bytes, most significant bit first, and hands each
/// byte to a <see cref="ByteRunLengthEncodingWriter"/>. A partially filled byte is
/// kept between <c>Write</c> calls and is only emitted once all eight bits are set
/// or when <see cref="Flush"/> is called.
/// </summary>
public class ContinuousBitWriter
{
    // Bit position at which a fresh byte starts filling; positions count down to 0.
    private const int HighBitIndex = 7;

    private readonly ByteRunLengthEncodingWriter _byteWriter;

    // Bits accumulated so far for the byte currently being filled.
    private byte _pendingByte;

    // Position inside _pendingByte that the next written bit will occupy.
    private int _bitIndex = HighBitIndex;

    // True when at least one bit has been written since the last emitted byte.
    private bool _hasPendingBits;

    /// <summary>
    /// Creates a bit writer whose bytes are written through a
    /// <see cref="ByteRunLengthEncodingWriter"/> wrapping <paramref name="outputStream"/>.
    /// </summary>
    /// <param name="outputStream">Stream passed to the underlying byte writer.</param>
    public ContinuousBitWriter(Stream outputStream)
    {
        _byteWriter = new ByteRunLengthEncodingWriter(outputStream);
    }

    /// <summary>
    /// Writes every value of <paramref name="values"/> in order.
    /// </summary>
    /// <param name="values">Bits to append; an empty list is a no-op.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="values"/> is null.</exception>
    public void Write(IList<bool> values)
    {
        if (values == null)
        {
            throw new System.ArgumentNullException(nameof(values));
        }

        foreach (var value in values)
        {
            Write(value);
        }
    }

    /// <summary>
    /// Appends a single bit to the pending byte and emits the byte once it is full.
    /// </summary>
    /// <param name="value">Bit to append; <c>true</c> sets the bit, <c>false</c> leaves it clear.</param>
    public void Write(bool value)
    {
        if (value)
        {
            _pendingByte |= (byte)(1 << _bitIndex);
        }

        _hasPendingBits = true;
        _bitIndex--;

        // Moving past bit 0 means all eight positions of the pending byte are used.
        if (_bitIndex < 0)
        {
            Flush();
        }
    }

    /// <summary>
    /// Emits the pending byte, if any bits were written to it, and starts a new byte.
    /// Positions that were never written stay zero. This does not flush the
    /// underlying stream; it only hands the byte to the byte writer.
    /// </summary>
    public void Flush()
    {
        if (_hasPendingBits)
        {
            _byteWriter.Write(new[] { _pendingByte });
        }

        _pendingByte = 0;
        _bitIndex = HighBitIndex;
        _hasPendingBits = false;
    }
}
}
11 changes: 8 additions & 3 deletions test/ApacheOrcDotNet.Test/ColumnTypes/BooleanColumn_Test.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,15 @@ public class BooleanColumn_Test
[Fact]
public void RoundTrip_BooleanColumn()
{
RoundTripSingleBool(70000);
// Default case
RoundTripSingleBool(70000);

// Problematic cases
RoundTripSingleBool(70000, 1000);
RoundTripSingleBool(70000, 10);
}

void RoundTripSingleBool(int numValues)
void RoundTripSingleBool(int numValues, int rowIndexStride = 10000)
{
var pocos = new List<SingleBoolPoco>();
var random = new Random(123);
Expand All @@ -26,7 +31,7 @@ void RoundTripSingleBool(int numValues)

var stream = new MemoryStream();
Footer footer;
StripeStreamHelper.Write(stream, pocos, out footer);
StripeStreamHelper.Write(stream, pocos, out footer, rowIndexStride: rowIndexStride);
var stripeStreams = StripeStreamHelper.GetStripeStreams(stream, footer);
var boolReader = new BooleanReader(stripeStreams, 1);
var results = boolReader.Read().ToArray();
Expand Down
4 changes: 2 additions & 2 deletions test/ApacheOrcDotNet.Test/ColumnTypes/StripeStreamHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ namespace ApacheOrcDotNet.Test.ColumnTypes
{
public static class StripeStreamHelper
{
public static void Write<T>(System.IO.Stream outputStream, IEnumerable<T> values, out Footer footer, SerializationConfiguration serializationConfiguration = null) where T : class
public static void Write<T>(System.IO.Stream outputStream, IEnumerable<T> values, out Footer footer, SerializationConfiguration serializationConfiguration = null, int rowIndexStride = 10000) where T : class
{
var bufferFactory = new OrcCompressedBufferFactory(256 * 1024, CompressionKind.Zlib, CompressionStrategy.Size);
var stripeWriter = new StripeWriter(typeof(T), outputStream, false, 0.8, 18,6, bufferFactory, 10000, 512 * 1024 * 1024, serializationConfiguration);
var stripeWriter = new StripeWriter(typeof(T), outputStream, false, 0.8, 18,6, bufferFactory, rowIndexStride, 512 * 1024 * 1024, serializationConfiguration);
stripeWriter.AddRows(values);
stripeWriter.RowAddingCompleted();
footer = stripeWriter.GetFooter();
Expand Down