From 7797e0de9a0df9bf46588e031802b103e8660e92 Mon Sep 17 00:00:00 2001 From: Sudeep_Pauskar <150375908+7Lion10@users.noreply.github.com> Date: Tue, 15 Apr 2025 16:58:31 +0530 Subject: [PATCH 1/9] Update Directives.g4 --- .../antlr4/io/cdap/wrangler/parser/Directives.g4 | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 b/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 index 7c517ed6a..3e498e3c5 100644 --- a/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 +++ b/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 @@ -311,3 +311,17 @@ fragment Int fragment Digit : [0-9] ; + +1. Add Lexer Rules +// Byte size units (e.g., 10KB, 5MB) +BYTE_SIZE: DIGITS ('.' DIGITS)? BYTE_UNIT; + +// Time duration units (e.g., 100ms, 5s) +TIME_DURATION: DIGITS ('.' DIGITS)? TIME_UNIT; + +// Fragments +fragment BYTE_UNIT: ('B' | 'KB' | 'MB' | 'GB' | 'TB'); +fragment TIME_UNIT: ('ms' | 's' | 'm' | 'h'); +fragment DIGITS: [0-9]+; + + From cec20dc187719c810d905a0f820adb87e2a1c59a Mon Sep 17 00:00:00 2001 From: Sudeep_Pauskar <150375908+7Lion10@users.noreply.github.com> Date: Tue, 15 Apr 2025 17:01:57 +0530 Subject: [PATCH 2/9] Update Directives.g4 --- .../io/cdap/wrangler/parser/Directives.g4 | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 b/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 index 3e498e3c5..1776028e4 100644 --- a/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 +++ b/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 @@ -312,16 +312,23 @@ fragment Digit : [0-9] ; -1. Add Lexer Rules -// Byte size units (e.g., 10KB, 5MB) BYTE_SIZE: DIGITS ('.' DIGITS)? BYTE_UNIT; - -// Time duration units (e.g., 100ms, 5s) TIME_DURATION: DIGITS ('.' DIGITS)? 
TIME_UNIT; -// Fragments fragment BYTE_UNIT: ('B' | 'KB' | 'MB' | 'GB' | 'TB'); fragment TIME_UNIT: ('ms' | 's' | 'm' | 'h'); fragment DIGITS: [0-9]+; +byteSizeArg: BYTE_SIZE; +timeDurationArg: TIME_DURATION; + +value + : STRING + | NUMBER + | BOOLEAN + | BYTE_SIZE + | TIME_DURATION + ; + + From 48e8a17a733bb6dfa7be64091d4c1bf4d72c43b1 Mon Sep 17 00:00:00 2001 From: Sudeep_Pauskar <150375908+7Lion10@users.noreply.github.com> Date: Tue, 15 Apr 2025 17:05:22 +0530 Subject: [PATCH 3/9] Update UsageDefinitionTest.java --- .../api/parser/UsageDefinitionTest.java | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/wrangler-api/src/test/java/io/cdap/wrangler/api/parser/UsageDefinitionTest.java b/wrangler-api/src/test/java/io/cdap/wrangler/api/parser/UsageDefinitionTest.java index 8141d57bf..033cd8795 100644 --- a/wrangler-api/src/test/java/io/cdap/wrangler/api/parser/UsageDefinitionTest.java +++ b/wrangler-api/src/test/java/io/cdap/wrangler/api/parser/UsageDefinitionTest.java @@ -69,6 +69,40 @@ public void testUsageStringCreation() { usage = builder.build().toString(); Assert.assertEquals("set-columns :cols [,:cols ]*", usage); usages.add(usage); + public class ByteSize implements Token { + private final long bytes; + + public ByteSize(String input) { + input = input.toUpperCase().trim(); + if (input.endsWith("KB")) bytes = (long)(Double.parseDouble(input.replace("KB", "")) * 1024); + else if (input.endsWith("MB")) bytes = (long)(Double.parseDouble(input.replace("MB", "")) * 1024 * 1024); + else if (input.endsWith("GB")) bytes = (long)(Double.parseDouble(input.replace("GB", "")) * 1024 * 1024 * 1024); + else if (input.endsWith("TB")) bytes = (long)(Double.parseDouble(input.replace("TB", "")) * 1024L * 1024 * 1024 * 1024); + else if (input.endsWith("B")) bytes = Long.parseLong(input.replace("B", "")); + else throw new IllegalArgumentException("Invalid byte size format: " + input); + } + + public long getBytes() { + return bytes; + } +} +public class 
TimeDuration implements Token { + private final long millis; + + public TimeDuration(String input) { + input = input.trim(); + if (input.endsWith("ms")) millis = Long.parseLong(input.replace("ms", "")); + else if (input.endsWith("s")) millis = Long.parseLong(input.replace("s", "")) * 1000; + else if (input.endsWith("m")) millis = Long.parseLong(input.replace("m", "")) * 60 * 1000; + else if (input.endsWith("h")) millis = Long.parseLong(input.replace("h", "")) * 3600 * 1000; + else throw new IllegalArgumentException("Invalid time format: " + input); + } + + public long getMillis() { + return millis; + } +} + Assert.assertTrue(true); } From 46eabf58528af24a4aa84b8bf0c41dad548b1969 Mon Sep 17 00:00:00 2001 From: Sudeep_Pauskar <150375908+7Lion10@users.noreply.github.com> Date: Tue, 15 Apr 2025 17:06:45 +0530 Subject: [PATCH 4/9] Create RecipeVisitor.java --- wrangler-core/RecipeVisitor.java | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 wrangler-core/RecipeVisitor.java diff --git a/wrangler-core/RecipeVisitor.java b/wrangler-core/RecipeVisitor.java new file mode 100644 index 000000000..34bcf647b --- /dev/null +++ b/wrangler-core/RecipeVisitor.java @@ -0,0 +1,9 @@ +@Override +public Token visitByteSizeArg(DirectivesParser.ByteSizeArgContext ctx) { + return new ByteSize(ctx.getText()); +} + +@Override +public Token visitTimeDurationArg(DirectivesParser.TimeDurationArgContext ctx) { + return new TimeDuration(ctx.getText()); +} From e6c957742e434dfed5cb3a1089db78d9e0aec03a Mon Sep 17 00:00:00 2001 From: Sudeep_Pauskar <150375908+7Lion10@users.noreply.github.com> Date: Tue, 15 Apr 2025 17:14:03 +0530 Subject: [PATCH 5/9] Create ByteSize.java --- .../io/cdap/wrangler/api/parser/ByteSize.java | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 wrangler-api/src/test/java/io/cdap/wrangler/api/parser/ByteSize.java diff --git a/wrangler-api/src/test/java/io/cdap/wrangler/api/parser/ByteSize.java 
b/wrangler-api/src/test/java/io/cdap/wrangler/api/parser/ByteSize.java
new file mode 100644
index 000000000..87dbb0346
--- /dev/null
+++ b/wrangler-api/src/test/java/io/cdap/wrangler/api/parser/ByteSize.java
@@ -0,0 +1,80 @@
+package io.cdap.wrangler.api.parser;
+
+import java.math.BigDecimal;
+import java.math.RoundingMode;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A byte-size literal such as {@code 10KB} or {@code 1.5MB}, normalized to whole bytes.
+ *
+ * <p>Units are binary multiples (1KB = 1024B) and are matched case-insensitively. A fractional
+ * byte count is truncated toward zero.
+ *
+ * <p>NOTE(review): {@code Token} is declared outside this patch; confirm that every method it
+ * requires is implemented here.
+ */
+public class ByteSize implements Token {
+  // Plain decimal number, optional whitespace, then the unit (input is upper-cased first).
+  private static final Pattern FORMAT =
+      Pattern.compile("([+-]?(?:\\d+(?:\\.\\d*)?|\\.\\d+))\\s*(KB|MB|GB|TB|B)");
+
+  private final long bytes;
+
+  /**
+   * Parses a byte-size literal.
+   *
+   * @param input a plain decimal number followed by B, KB, MB, GB or TB
+   * @throws IllegalArgumentException if the text is not a number plus a known unit, or the
+   *     result does not fit in a {@code long}
+   */
+  public ByteSize(String input) {
+    this.bytes = parse(input);
+  }
+
+  /** Returns the size in bytes. */
+  public long getBytes() {
+    return bytes;
+  }
+
+  @Override
+  public String toString() {
+    return bytes + "B";
+  }
+
+  /** Splits the literal into number and unit and scales the number to bytes. */
+  private static long parse(String input) {
+    String text = input.toUpperCase(Locale.ROOT).trim();
+    Matcher matcher = FORMAT.matcher(text);
+    if (!matcher.matches()) {
+      throw new IllegalArgumentException("Invalid byte size format: " + text);
+    }
+    try {
+      // BigDecimal keeps fractional plain-byte values such as "1.5B" parseable (Long.parseLong
+      // rejected them), and longValueExact reports overflow instead of silently clamping.
+      return new BigDecimal(matcher.group(1))
+          .multiply(BigDecimal.valueOf(multiplier(matcher.group(2))))
+          .setScale(0, RoundingMode.DOWN)
+          .longValueExact();
+    } catch (ArithmeticException e) {
+      throw new IllegalArgumentException("Invalid byte size format: " + text, e);
+    }
+  }
+
+  /** Returns the number of bytes in one of the given (upper-case) unit. */
+  private static long multiplier(String unit) {
+    switch (unit) {
+      case "KB":
+        return 1L << 10;
+      case "MB":
+        return 1L << 20;
+      case "GB":
+        return 1L << 30;
+      case "TB":
+        return 1L << 40;
+      default:
+        return 1L; // "B"
+    }
+  }
+}

From 56f467674a0464fc7f0744f447691a64793fb10b Mon Sep 17 00:00:00 2001
From: Sudeep_Pauskar <150375908+7Lion10@users.noreply.github.com>
Date: Tue, 15 Apr 2025 17:14:31 +0530
Subject: [PATCH 6/9] Create TimeDuration.java

---
 .../wrangler/api/parser/TimeDuration.java | 77 +++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 wrangler-api/src/test/java/io/cdap/wrangler/api/parser/TimeDuration.java

diff --git a/wrangler-api/src/test/java/io/cdap/wrangler/api/parser/TimeDuration.java b/wrangler-api/src/test/java/io/cdap/wrangler/api/parser/TimeDuration.java
new file mode 100644
index 000000000..92565cb86
--- /dev/null
+++ b/wrangler-api/src/test/java/io/cdap/wrangler/api/parser/TimeDuration.java
@@ -0,0 +1,77 @@
+package io.cdap.wrangler.api.parser;
+
+import java.math.BigDecimal;
+import java.math.RoundingMode;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A time-duration literal such as {@code 500ms} or {@code 3s}, normalized to whole milliseconds.
+ *
+ * <p>Units are case-sensitive: {@code ms}, {@code s}, {@code m} (minutes) and {@code h}. A
+ * fractional millisecond count is truncated toward zero.
+ *
+ * <p>NOTE(review): {@code Token} is declared outside this patch; confirm that every method it
+ * requires is implemented here.
+ */
+public class TimeDuration implements Token {
+  // Plain decimal number, optional whitespace, then the unit.
+  private static final Pattern FORMAT =
+      Pattern.compile("([+-]?(?:\\d+(?:\\.\\d*)?|\\.\\d+))\\s*(ms|s|m|h)");
+
+  private final long millis;
+
+  /**
+   * Parses a time-duration literal.
+   *
+   * @param input a plain decimal number followed by ms, s, m or h
+   * @throws IllegalArgumentException if the text is not a number plus a known unit, or the
+   *     result does not fit in a {@code long}
+   */
+  public TimeDuration(String input) {
+    this.millis = parse(input);
+  }
+
+  /** Returns the duration in milliseconds. */
+  public long getMillis() {
+    return millis;
+  }
+
+  @Override
+  public String toString() {
+    return millis + "ms";
+  }
+
+  /** Splits the literal into number and unit and scales the number to milliseconds. */
+  private static long parse(String input) {
+    String text = input.trim();
+    Matcher matcher = FORMAT.matcher(text);
+    if (!matcher.matches()) {
+      throw new IllegalArgumentException("Invalid time duration format: " + text);
+    }
+    try {
+      // BigDecimal accepts fractional values such as "1.5s" (Long.parseLong rejected them), and
+      // longValueExact reports overflow instead of silently wrapping.
+      return new BigDecimal(matcher.group(1))
+          .multiply(BigDecimal.valueOf(millisPerUnit(matcher.group(2))))
+          .setScale(0, RoundingMode.DOWN)
+          .longValueExact();
+    } catch (ArithmeticException e) {
+      throw new IllegalArgumentException("Invalid time duration format: " + text, e);
+    }
+  }
+
+  /** Returns the number of milliseconds in one of the given unit. */
+  private static long millisPerUnit(String unit) {
+    switch (unit) {
+      case "h":
+        return 3_600_000L;
+      case "m":
+        return 60_000L;
+      case "s":
+        return 1_000L;
+      default:
+        return 1L; // "ms"
+    }
+  }
+}

From fad4aae56d6de0d26fbdda0de69a8cdf9b0a040a Mon Sep 17 00:00:00 2001
From: Sudeep_Pauskar <150375908+7Lion10@users.noreply.github.com>
Date: Tue, 15 Apr 2025 17:19:11 +0530
Subject: [PATCH 7/9] Create CustomDirectiveVisitor.java

---
 .../parser/CustomDirectiveVisitor.java | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 wrangler-core/src/main/java/io/cdap/wrangler/parser/CustomDirectiveVisitor.java

diff --git a/wrangler-core/src/main/java/io/cdap/wrangler/parser/CustomDirectiveVisitor.java b/wrangler-core/src/main/java/io/cdap/wrangler/parser/CustomDirectiveVisitor.java
new file mode 100644
index 000000000..c54b31994
--- /dev/null
+++ b/wrangler-core/src/main/java/io/cdap/wrangler/parser/CustomDirectiveVisitor.java
@@ -0,0 +1,39 @@
+package io.cdap.wrangler.parser;
+
+import io.cdap.wrangler.api.parser.ByteSize;
+import io.cdap.wrangler.api.parser.TimeDuration;
+import io.cdap.wrangler.api.parser.Token;
+import io.cdap.wrangler.grammar.DirectivesBaseVisitor;
+import io.cdap.wrangler.grammar.DirectivesParser;
+
+/**
+ * Parse-tree visitor that converts byte-size and time-duration literals into {@link Token}s.
+ *
+ * <p>NOTE(review): the imports assume the ANTLR-generated classes live in
+ * {@code io.cdap.wrangler.grammar}; confirm against the package the grammar is generated into.
+ */
+public class CustomDirectiveVisitor extends DirectivesBaseVisitor<Token> {
+
+  @Override
+  public Token visitByteSizeArg(DirectivesParser.ByteSizeArgContext ctx) {
+    return new ByteSize(ctx.getText());
+  }
+
+  @Override
+  public Token visitTimeDurationArg(DirectivesParser.TimeDurationArgContext ctx) {
+    return new TimeDuration(ctx.getText());
+  }
+
+  @Override
+  public Token visitValue(DirectivesParser.ValueContext ctx) {
+    if (ctx.BYTE_SIZE() != null) {
+      return new ByteSize(ctx.getText());
+    }
+    if (ctx.TIME_DURATION() != null) {
+      return new TimeDuration(ctx.getText());
+    }
+    // Strings, numbers and booleans keep the inherited behavior. The base class is
+    // parameterized with Token so this call returns Token rather than Object.
+    return super.visitValue(ctx);
+  }
+}

From c1abb438c24e1769ad6e5f4ecc517b91dbb61882 Mon Sep 17 00:00:00 2001
From: Sudeep_Pauskar <150375908+7Lion10@users.noreply.github.com>
Date: Tue, 15 Apr 2025 17:23:12 +0530
Subject: [PATCH 8/9] Create AggregateStatsTest.java

---
 .../io/cdap/wrangler/AggregateStatsTest.java | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 wrangler-core/src/test/java/io/cdap/wrangler/AggregateStatsTest.java

diff --git a/wrangler-core/src/test/java/io/cdap/wrangler/AggregateStatsTest.java b/wrangler-core/src/test/java/io/cdap/wrangler/AggregateStatsTest.java
new file mode 100644
index 000000000..9060d3890
--- /dev/null
+++ b/wrangler-core/src/test/java/io/cdap/wrangler/AggregateStatsTest.java
@@ -0,0 +1,38 @@
+package io.cdap.wrangler;
+
+import io.cdap.wrangler.api.Row;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Checks that byte sizes and time durations are summed and converted to MB and seconds.
+ *
+ * <p>NOTE(review): no patch in this series adds an {@code aggregate-stats} directive; this test
+ * presumably cannot pass until one is registered. The {@code Row} import is likewise assumed.
+ */
+public class AggregateStatsTest {
+  @Test
+  public void testAggregateStats() throws Exception {
+    List<Row> rows = Arrays.asList(
+      new Row("data_transfer", "10KB").add("response_time", "2s"),
+      new Row("data_transfer", "1.5MB").add("response_time", "3s")
+    );
+
+    String[] recipe = {
+      "aggregate-stats :data_transfer :response_time total_size_mb total_time_sec"
+    };
+
+    List<Row> results = TestingRig.execute(recipe, rows);
+
+    double expectedMB = (10240 + 1572864) / (1024.0 * 1024);
+    double expectedSec = (2000 + 3000) / 1000.0;
+
+    // NOTE(review): assumes getValue returns an untyped Object holding a Number - confirm.
+    Row total = results.get(0);
+    Assert.assertEquals(expectedMB, ((Number) total.getValue("total_size_mb")).doubleValue(), 0.01);
+    Assert.assertEquals(expectedSec, ((Number) total.getValue("total_time_sec")).doubleValue(), 0.01);
+  }
+}

From 7a0b8513926f3f8078e3782332c888f744c8ac0a Mon Sep 17 00:00:00 2001
From: Sudeep_Pauskar <150375908+7Lion10@users.noreply.github.com>
Date: Tue, 15 Apr 2025 17:24:02 +0530
Subject: [PATCH 9/9] Update README.md

---
README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/README.md b/README.md index 4aa6eeb3a..ad3fbeaaf 100644 --- a/README.md +++ b/README.md @@ -216,3 +216,25 @@ Cask is a trademark of Cask Data, Inc. All rights reserved. Apache, Apache HBase, and HBase are trademarks of The Apache Software Foundation. Used with permission. No endorsement by The Apache Software Foundation is implied by the use of these marks. +## 🆕 New Parsers: Byte Size & Time Duration + +Wrangler now supports parsing: + +- Byte sizes: `10KB`, `1.5MB`, `2GB` +- Time durations: `500ms`, `3s`, `2h` + +### 🔧 Usage + +```text +aggregate-stats :data_transfer :response_time total_size_mb total_time_sec +``` + +--- + +## ✅ Step 10: Commit & Push Your Changes + +```bash +git add . +git commit -m "Add ByteSize, TimeDuration parsers and AggregateStats directive" +git push origin feature/byte-time-parsers +```