-
Notifications
You must be signed in to change notification settings - Fork 57
feat: column mapping and auto column mapping #118
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
742e7a0
aaf752a
a96067b
4029a15
7659f43
e1caee9
7fef706
b8992c0
d1d9f31
4fbdb3c
881957b
18e4a95
6f6c8ed
0cc1034
375f0b9
ec2f122
6c15df0
0b67b31
8d5e553
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,13 +3,15 @@ package main | |
|
||
import ( | ||
"context" | ||
"encoding/json" | ||
"errors" | ||
"flag" | ||
"fmt" | ||
"io" | ||
"log" | ||
"os" | ||
"runtime" | ||
"strings" | ||
"time" | ||
|
||
"github.com/timescale/timescaledb-parallel-copy/pkg/csvcopy" | ||
|
@@ -33,11 +35,14 @@ var ( | |
quoteCharacter string | ||
escapeCharacter string | ||
|
||
fromFile string | ||
columns string | ||
skipHeader bool | ||
headerLinesCnt int | ||
skipBatchErrors bool | ||
fromFile string | ||
columns string | ||
columnMapping string | ||
autoColumnMapping bool | ||
skipHeader bool | ||
headerLinesCnt int | ||
skipLines int | ||
skipBatchErrors bool | ||
|
||
importID string | ||
workers int | ||
|
@@ -68,8 +73,12 @@ func init() { | |
flag.StringVar(&escapeCharacter, "escape", "", "The ESCAPE `character` to use during COPY (default '\"')") | ||
flag.StringVar(&fromFile, "file", "", "File to read from rather than stdin") | ||
flag.StringVar(&columns, "columns", "", "Comma-separated columns present in CSV") | ||
flag.StringVar(&columnMapping, "column-mapping", "", "Column mapping from CSV to database columns (format: \"csv_col1:db_col1,csv_col2:db_col2\" or JSON)") | ||
flag.BoolVar(&autoColumnMapping, "auto-column-mapping", false, "Automatically map CSV headers to database columns with the same names") | ||
|
||
flag.BoolVar(&skipHeader, "skip-header", false, "Skip the first line of the input") | ||
flag.IntVar(&headerLinesCnt, "header-line-count", 1, "Number of header lines") | ||
flag.IntVar(&headerLinesCnt, "header-line-count", 1, "(deprecated) Number of header lines") | ||
flag.IntVar(&skipLines, "skip-lines", 0, "Skip the first n lines of the input. it is applied before skip-header") | ||
|
||
flag.BoolVar(&skipBatchErrors, "skip-batch-errors", false, "if true, the copy will continue even if a batch fails") | ||
|
||
|
@@ -103,6 +112,11 @@ func main() { | |
if dbName != "" { | ||
log.Fatalf("Error: Deprecated flag -db-name is being used. Update -connection to connect to the given database") | ||
} | ||
|
||
if headerLinesCnt != 1 { | ||
log.Fatalf("Error: -header-line-count is deprecated. Use -skip-lines instead") | ||
} | ||
|
||
logger := &csvCopierLogger{} | ||
|
||
opts := []csvcopy.Option{ | ||
|
@@ -127,6 +141,18 @@ func main() { | |
opts = append(opts, csvcopy.WithImportID(importID)) | ||
} | ||
|
||
if columnMapping != "" { | ||
mapping, err := parseColumnMapping(columnMapping) | ||
if err != nil { | ||
log.Fatalf("Error parsing column mapping: %v", err) | ||
} | ||
opts = append(opts, csvcopy.WithColumnMapping(mapping)) | ||
} | ||
|
||
if autoColumnMapping { | ||
opts = append(opts, csvcopy.WithAutoColumnMapping()) | ||
} | ||
|
||
batchErrorHandler := csvcopy.BatchHandlerError() | ||
if skipBatchErrors { | ||
batchErrorHandler = csvcopy.BatchHandlerNoop() | ||
|
@@ -136,10 +162,12 @@ func main() { | |
} | ||
opts = append(opts, csvcopy.WithBatchErrorHandler(batchErrorHandler)) | ||
|
||
if skipLines > 0 { | ||
opts = append(opts, csvcopy.WithSkipHeaderCount(skipLines)) | ||
} | ||
|
||
if skipHeader { | ||
opts = append(opts, | ||
csvcopy.WithSkipHeaderCount(headerLinesCnt), | ||
) | ||
opts = append(opts, csvcopy.WithSkipHeader(true)) | ||
} | ||
|
||
copier, err := csvcopy.NewCopier( | ||
|
@@ -190,3 +218,73 @@ func main() { | |
} | ||
fmt.Println(res) | ||
} | ||
|
||
// parseColumnMapping parses column mapping string into csvcopy.ColumnsMapping | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe we should move these to a config package and create some tests to validate that the mapping works. We should also test weird but valid Postgres column names. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd rather keep this function in the main.go file than create a separate package, especially given that it is specific to the CLI interface. |
||
// Supports two formats: | ||
// 1. Simple: "csv_col1:db_col1,csv_col2:db_col2" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do we need to support 2 methods? Wouldn't one simplify UX for the user? Have you run tests with random column names that need to be quoted? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The simple method works for most use cases and is very comfortable to type in the terminal. But, as you already noticed, it will fail with strange column names, as they will probably conflict with terminal syntax. The JSON format is bulletproof: JSON encoding is well defined, and you can easily define any column name you want without having to worry about your terminal. This includes syntax to quote characters, so there is no need for extra validation. If you mess it up, the code will fail on the first attempt to insert data into your database. It also doesn't make a lot of sense to have unit tests for this, as it is just json.Unmarshal. |
||
// 2. JSON: {"csv_col1":"db_col1","csv_col2":"db_col2"} | ||
func parseColumnMapping(mappingStr string) (csvcopy.ColumnsMapping, error) { | ||
if mappingStr == "" { | ||
return nil, nil | ||
} | ||
|
||
mappingStr = strings.TrimSpace(mappingStr) | ||
|
||
// Check if it's JSON format (starts with '{') | ||
if strings.HasPrefix(mappingStr, "{") { | ||
return parseJSONColumnMapping(mappingStr) | ||
} | ||
|
||
// Parse simple format: "csv_col1:db_col1,csv_col2:db_col2" | ||
return parseSimpleColumnMapping(mappingStr) | ||
} | ||
|
||
// parseJSONColumnMapping parses JSON format column mapping | ||
func parseJSONColumnMapping(jsonStr string) (csvcopy.ColumnsMapping, error) { | ||
var mappingMap map[string]string | ||
if err := json.Unmarshal([]byte(jsonStr), &mappingMap); err != nil { | ||
return nil, fmt.Errorf("invalid JSON format for column mapping: %w", err) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's return the correct format as part of the error. |
||
} | ||
|
||
var mapping csvcopy.ColumnsMapping | ||
for csvCol, dbCol := range mappingMap { | ||
mapping = append(mapping, csvcopy.ColumnMapping{ | ||
CSVColumnName: csvCol, | ||
DatabaseColumnName: dbCol, | ||
}) | ||
} | ||
|
||
return mapping, nil | ||
} | ||
|
||
// parseSimpleColumnMapping parses simple format: "csv_col1:db_col1,csv_col2:db_col2" | ||
func parseSimpleColumnMapping(simpleStr string) (csvcopy.ColumnsMapping, error) { | ||
pairs := strings.Split(simpleStr, ",") | ||
var mapping csvcopy.ColumnsMapping | ||
|
||
for i, pair := range pairs { | ||
pair = strings.TrimSpace(pair) | ||
if pair == "" { | ||
continue | ||
} | ||
|
||
parts := strings.Split(pair, ":") | ||
if len(parts) != 2 { | ||
return nil, fmt.Errorf("invalid column mapping format at position %d: '%s', expected 'csv_column:db_column'", i+1, pair) | ||
} | ||
|
||
csvCol := strings.TrimSpace(parts[0]) | ||
dbCol := strings.TrimSpace(parts[1]) | ||
|
||
if csvCol == "" || dbCol == "" { | ||
return nil, fmt.Errorf("empty column name in mapping at position %d: '%s'", i+1, pair) | ||
} | ||
|
||
mapping = append(mapping, csvcopy.ColumnMapping{ | ||
CSVColumnName: csvCol, | ||
DatabaseColumnName: dbCol, | ||
}) | ||
} | ||
|
||
return mapping, nil | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why are we deprecating this flag? And why in the context of this PR?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it is in the thread above #118 (comment)