Commit 85c2397
Support array properties in CSVs. (#37)
* Support array properties in CSVs.
* Address PR comments
1 parent eaf097e commit 85c2397

5 files changed (+135, −7 lines)

README.md

Lines changed: 16 additions & 0 deletions
@@ -87,6 +87,7 @@ The flags for `max-token-count`, `max-buffer-size`, and `max-token-size` are typ
 - `integer`: an unquoted value that can be read as an integer type.
 - `double`: an unquoted value that can be read as a floating-point type.
 - `string`: any field that is either quote-interpolated or cannot be cast to a numeric or boolean type.
+- `array`: a bracket-interpolated array of elements of any type. Strings within the array must be explicitly quote-interpolated. Array properties require the use of a non-comma delimiter for the CSV (`-o`).
 - Cypher does not allow NULL values to be assigned to properties.
 - The default behaviour is to infer the property type, attempting to cast it to integer, float, boolean, or string in that order.
 - The `--enforce-schema` flag and an [Input Schema](#input-schemas) should be used if type inference is not desired.
@@ -104,6 +105,20 @@ The flags for `max-token-count`, `max-buffer-size`, and `max-token-size` are typ
 - If the file has more than 2 fields, all subsequent fields are relationship properties that adhere to the same rules as node properties.
 - Described relationships are always considered to be directed (source->destination).

+### Input CSV example
+Store.csv
+```
+storeNum | Location | daysOpen
+118 | 123 Main St | ['Mon', 'Wed', 'Fri']
+136 | 55 Elm St | ['Sat', 'Sun']
+```
+This CSV would be inserted with the command:
+`redisgraph-bulk-loader StoreGraph --separator \| --nodes Store.csv`
+
+(Since the pipe character has meaning in the terminal, it must be backslash-escaped.)
+
+All `storeNum` properties will be inserted as integers, all `Location` values as strings, and all `daysOpen` values as arrays of strings.
+
 ## Input Schemas
 If the `--enforce-schema` flag is specified, all input CSVs will be expected to specify each column's data type in the header.
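For context on how those bracketed cells become graph values: the loader parses them with Python's `ast.literal_eval`, as the `entity_file.py` diff below shows. A minimal standalone sketch of that parsing step (not part of this commit):

```python
import ast

field = "['Mon', 'Wed', 'Fri']"  # one daysOpen cell from Store.csv above
# The loader treats a field as an array only when it is bracket-interpolated.
if field.strip()[0] == '[' and field.strip()[-1] == ']':
    days_open = ast.literal_eval(field)
    print(days_open)  # ['Mon', 'Wed', 'Fri']
```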

@@ -122,6 +137,7 @@ The accepted data types are:
 | INT / INTEGER / LONG | A signed 64-bit integer value | Yes |
 | BOOLEAN | A boolean value indicated by the string 'true' or 'false' | Yes |
 | STRING | A string value | Yes |
+| ARRAY | An array value | Yes |

 If an `ID` column has a name string, the value will be added to each node as a property. Otherwise, it is internal to the bulk loader operation and will not appear in the graph. `START_ID` and `END_ID` columns will never be added as properties.
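As an illustration of the schema-enforced path with the new type, a header for the Store.csv example above could declare its columns like this (a sketch; this file is not part of the repository):

```
storeNum:INT | Location:STRING | daysOpen:ARRAY
118 | 123 Main St | ['Mon', 'Wed', 'Fri']
```

It would be loaded with `redisgraph-bulk-loader StoreGraph --separator \| --enforce-schema --nodes Store.csv`.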

redisgraph_bulk_loader/entity_file.py

Lines changed: 41 additions & 6 deletions
@@ -1,11 +1,15 @@
 import os
 import io
 import csv
+import ast
+import sys
 import math
 import struct
 from enum import Enum
 from exceptions import CSVError, SchemaError

+csv.field_size_limit(sys.maxsize)  # Don't limit the size of user input fields.
+

 class Type(Enum):
     UNKNOWN = 0
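A side note on the new `csv.field_size_limit` call: Python's `csv` module rejects fields longer than 131072 characters by default, which a long array cell could plausibly exceed; raising the limit avoids a `_csv.Error` complaining that the field is larger than the field limit. A quick check of the default:

```python
import csv

# CPython's default CSV field size limit is 131072 characters (128 KiB);
# field_size_limit() returns the current limit when called with no argument.
print(csv.field_size_limit())  # 131072 by default
```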
@@ -16,10 +20,11 @@ class Type(Enum):
     LONG = 4
     INT = 4  # alias to LONG
     INTEGER = 4  # alias to LONG
-    ID = 5
-    START_ID = 6
-    END_ID = 7
-    IGNORE = 8
+    ARRAY = 5
+    ID = 6
+    START_ID = 7
+    END_ID = 8
+    IGNORE = 9


 def convert_schema_type(in_type):
@@ -38,11 +43,28 @@ def convert_schema_type(in_type):
         raise SchemaError("Encountered invalid field type '%s'" % in_type)


+def array_prop_to_binary(format_str, prop_val):
+    # Evaluate the array literal so that its elements can be converted.
+    # (This allows us to handle nested arrays.)
+    array_val = ast.literal_eval(prop_val)
+    # Send the array length as a long.
+    array_to_send = struct.pack(format_str + "q", Type.ARRAY.value, len(array_val))
+    # Recursively convert and send each array element.
+    for elem in array_val:
+        array_to_send += inferred_prop_to_binary(str(elem))
+    # Return the full array struct.
+    return array_to_send
+
+
 # Convert a property field with an enforced type into a binary stream.
 # Supported property types are string, integer, float, and boolean.
 def typed_prop_to_binary(prop_val, prop_type):
     # All format strings start with an unsigned char to represent our prop_type enum
     format_str = "=B"
+
+    # Remove leading and trailing whitespace.
+    prop_val = prop_val.strip()
+
     # TODO allow ID type specification
     if prop_type == Type.ID or prop_type == Type.LONG:
         try:
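The binary layout `array_prop_to_binary` produces is a type byte, a 64-bit element count, then each element's own encoding concatenated. A standalone sketch of just the header packing (enum value hard-coded for illustration):

```python
import struct

ARRAY_TYPE = 5  # Type.ARRAY.value after this change

# "=B" is an unsigned char in native byte order with no padding; appending
# "q" adds a signed 64-bit element count. Element encodings follow it.
header = struct.pack("=B" + "q", ARRAY_TYPE, 3)
print(len(header), header.hex())  # 9 bytes: 1 type byte + 8 length bytes
```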
@@ -79,6 +101,11 @@ def typed_prop_to_binary(prop_val, prop_type):
         format_str += "%ds" % (len(encoded_str) + 1)
         return struct.pack(format_str, Type.STRING.value, encoded_str)

+    elif prop_type == Type.ARRAY:
+        if prop_val[0] != '[' or prop_val[-1] != ']':
+            raise SchemaError("Could not parse '%s' as an array" % prop_val)
+        return array_prop_to_binary(format_str, prop_val)
+
     # If it hasn't returned by this point, it is trying to set it to a type that it can't adopt
     raise Exception("unable to parse [" + prop_val + "] with type [" + repr(prop_type) + "]")

@@ -93,6 +120,9 @@ def inferred_prop_to_binary(prop_val):
         # TODO This is not allowed in Cypher, consider how to handle it here rather than in-module.
         return struct.pack(format_str, 0)

+    # Remove leading and trailing whitespace.
+    prop_val = prop_val.strip()
+
     # Try to parse value as an integer.
     try:
         numeric_prop = int(prop_val)
@@ -114,6 +144,10 @@ def inferred_prop_to_binary(prop_val):
     elif prop_val.lower() == 'true':
         return struct.pack(format_str + '?', Type.BOOL.value, True)

+    # If the property string is bracket-interpolated, it is an array.
+    if prop_val[0] == '[' and prop_val[-1] == ']':
+        return array_prop_to_binary(format_str, prop_val)
+
     # If we've reached this point, the property is a string.
     encoded_str = str.encode(prop_val)  # struct.pack requires bytes objects as arguments
     # Encoding len+1 adds a null terminator to the string
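Putting the inference rules together, a value is tried as integer, double, boolean, then array, with string as the fallback. A self-contained sketch of that ordering (a simplification; the real function also emits the binary encoding):

```python
def inferred_type(val):
    # Simplified mirror of inferred_prop_to_binary's type checks, in order.
    val = val.strip()
    try:
        int(val)
        return 'integer'
    except ValueError:
        pass
    try:
        float(val)
        return 'double'
    except ValueError:
        pass
    if val.lower() in ('false', 'true'):
        return 'boolean'
    if val and val[0] == '[' and val[-1] == ']':
        return 'array'
    return 'string'

print(inferred_type("['Mon', 'Wed']"))  # array
```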
@@ -192,7 +226,7 @@ def convert_header_with_schema(self, header):
                 raise CSVError("Field '%s' had %d colons" % field, len(field))

             # Convert the column type.
-            col_type = convert_schema_type(pair[1].upper())
+            col_type = convert_schema_type(pair[1].upper().strip())

             # If the column did not have a name but the type requires one, emit an error.
             if len(pair[0]) == 0 and col_type not in (Type.ID, Type.START_ID, Type.END_ID, Type.IGNORE):
@@ -201,7 +235,8 @@ def convert_header_with_schema(self, header):
             # We have a column name and a type.
             # Only store the name if the column's values should be added as properties.
             if len(pair[0]) > 0 and col_type not in (Type.START_ID, Type.END_ID, Type.IGNORE):
-                self.column_names[idx] = pair[0]
+                column_name = pair[0].strip()
+                self.column_names[idx] = column_name

             # Store the column type.
             self.types[idx] = col_type

redisgraph_bulk_loader/label.py

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ def process_schemaless_header(self, header):
         self.id = 0

         for idx, field in enumerate(header):
+            field = field.strip()
             self.column_names[idx] = field

         if header[0][0] == '_':

redisgraph_bulk_loader/relation_type.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ def process_schemaless_header(self, header):
         self.end_namespace = None

         for idx, field in enumerate(header[2:]):
-            self.column_names[idx+2] = field
+            self.column_names[idx+2] = field.strip()

     def post_process_header_with_schema(self, header):
         # Can interleave these tasks if preferred.
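The `.strip()` calls here and in `label.py` guard against stray whitespace around header fields, which is easy to introduce when a CSV uses a padded separator like ` | `. A trivial illustration (sample header invented):

```python
header = ['storeNum ', ' Location', ' daysOpen ']
print([field.strip() for field in header])  # ['storeNum', 'Location', 'daysOpen']
```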

test/test_bulk_loader.py

Lines changed: 76 additions & 0 deletions
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-

 import os
+import sys
 import csv
 import redis
 import unittest
@@ -584,6 +585,81 @@ def test13_id_namespaces(self):
                            [1, 'Filipe', 'User', 1, 40, 'Post']]
         self.assertEqual(query_result.result_set, expected_result)

+    def test14_array_properties_inferred(self):
+        """Validate that array properties are correctly inserted."""
+
+        graphname = "arr_graph"
+        with open('/tmp/nodes.tmp', mode='w') as csv_file:
+            out = csv.writer(csv_file, delimiter='|')
+            out.writerow(['str_col', 'arr_col'])
+            out.writerow(['str1', """[1, 0.2, 'nested_str', False]"""])
+            out.writerow(['str2', """['prop1', ['nested_1', 'nested_2'], 5]"""])
+
+        runner = CliRunner()
+        res = runner.invoke(bulk_insert, ['--nodes', '/tmp/nodes.tmp',
+                                          '--separator', '|',
+                                          graphname], catch_exceptions=False)
+
+        self.assertEqual(res.exit_code, 0)
+        self.assertIn('2 nodes created', res.output)
+
+        graph = Graph(graphname, self.redis_con)
+        query_result = graph.query('MATCH (a) RETURN a ORDER BY a.str_col')
+
+        node_1 = {'str_col': 'str1', 'arr_col': [1, 0.2, 'nested_str', False]}
+        node_2 = {'str_col': 'str2', 'arr_col': ['prop1', ['nested_1', 'nested_2'], 5]}
+        self.assertEqual(query_result.result_set[0][0].properties, node_1)
+        self.assertEqual(query_result.result_set[1][0].properties, node_2)
+
+    def test15_array_properties_schema_enforced(self):
+        """Validate that array properties are correctly inserted with an enforced schema."""
+
+        graphname = "arr_graph_with_schema"
+        with open('/tmp/nodes.tmp', mode='w') as csv_file:
+            out = csv.writer(csv_file, delimiter='|')
+            out.writerow(['str_col:STRING', 'arr_col:ARRAY'])
+            out.writerow(['str1', """[1, 0.2, 'nested_str', False]"""])
+            out.writerow(['str2', """['prop1', ['nested_1', 'nested_2'], 5]"""])
+
+        runner = CliRunner()
+        res = runner.invoke(bulk_insert, ['--nodes', '/tmp/nodes.tmp',
+                                          '--separator', '|',
+                                          '--enforce-schema',
+                                          graphname], catch_exceptions=False)
+
+        self.assertEqual(res.exit_code, 0)
+        self.assertIn('2 nodes created', res.output)
+
+        graph = Graph(graphname, self.redis_con)
+        query_result = graph.query('MATCH (a) RETURN a ORDER BY a.str_col')
+
+        node_1 = {'str_col': 'str1', 'arr_col': [1, 0.2, 'nested_str', False]}
+        node_2 = {'str_col': 'str2', 'arr_col': ['prop1', ['nested_1', 'nested_2'], 5]}
+        self.assertEqual(query_result.result_set[0][0].properties, node_1)
+        self.assertEqual(query_result.result_set[1][0].properties, node_2)
+
+    def test16_error_on_schema_failure(self):
+        """Validate that the loader errors on processing non-conformant CSVs with an enforced schema."""
+
+        graphname = "schema_error"
+        with open('/tmp/nodes.tmp', mode='w') as csv_file:
+            out = csv.writer(csv_file, delimiter='|')
+            out.writerow(['str_col:STRING', 'arr_col:ARRAY'])
+            out.writerow(['str1', """[1, 0.2, 'nested_str', False]"""])
+            out.writerow(['str2', 'strval'])
+
+        try:
+            runner = CliRunner()
+            runner.invoke(bulk_insert, ['--nodes', '/tmp/nodes.tmp',
+                                        '--separator', '|',
+                                        '--enforce-schema',
+                                        graphname], catch_exceptions=False)
+            self.fail()  # Should be unreachable
+        except Exception as e:
+            # Verify that the correct exception is raised.
+            self.assertEqual(sys.exc_info()[0].__name__, 'SchemaError')
+            self.assertIn("Could not parse 'strval' as an array", e.args)
+

 if __name__ == '__main__':
     unittest.main()
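A usage note, not part of the diff: since the test module ends with `unittest.main()`, the suite can be run directly with `python test/test_bulk_loader.py`, assuming a local Redis server with the RedisGraph module loaded for the graph assertions.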
