Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion app/controllers/bulkrax/guided_imports_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def render_invalid_uploaded_files_response
# @param admin_set_id [String, nil] optional admin set ID for validation context
# @return [Hash] validation result data
def run_validation(csv_file, zip_file, admin_set_id: nil)
# NOTE(review): this hunk is rendered without +/- markers. Given the
# "1 addition & 1 deletion" summary for this file, the CsvValidationService
# call below is presumably the removed line and the CsvParser call its
# replacement — confirm against the merged file before treating both as live.
CsvValidationService.validate(csv_file: csv_file, zip_file: zip_file, admin_set_id: admin_set_id)
# Forwards the uploaded CSV, optional zip and admin set ID to
# CsvParser.validate_csv; being the last expression, its result is returned.
CsvParser.validate_csv(csv_file: csv_file, zip_file: zip_file, admin_set_id: admin_set_id)
end

def importer_params
Expand Down
2 changes: 1 addition & 1 deletion app/controllers/bulkrax/importers_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def new
# GET /importers/sample_csv_file
def sample_csv_file
admin_set_id = params[:admin_set_id].presence
sample = Bulkrax::CsvValidationService.generate_template(models: 'all', output: 'file', admin_set_id: admin_set_id)
sample = Bulkrax::CsvParser.generate_template(models: 'all', output: 'file', admin_set_id: admin_set_id)
send_file sample, filename: File.basename(sample), type: 'text/csv', disposition: 'attachment'
rescue StandardError => e
flash[:error] = "Unable to generate sample CSV file: #{e.message}"
Expand Down
9 changes: 7 additions & 2 deletions app/parsers/bulkrax/csv_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ module Bulkrax
class CsvParser < ApplicationParser # rubocop:disable Metrics/ClassLength
include ErroredEntries
include ExportBehavior
include CsvParser::CsvTemplateGeneration
include CsvParser::CsvValidation
attr_writer :collections, :file_sets, :works
attr_accessor :validation_mode

def self.export_supported?
true
Expand All @@ -15,8 +18,10 @@ def records(_opts = {})

file_for_import = only_updates ? parser_fields['partial_import_file_path'] : import_file_path
csv_data = entry_class.read_data(file_for_import)
importer.parser_fields['total'] = csv_data.count
importer.save
unless validation_mode
importer.parser_fields['total'] = csv_data.count
importer.save
end

@records = csv_data.map { |record_data| entry_class.data_for_entry(record_data, nil, self) }
@records
Expand Down
73 changes: 73 additions & 0 deletions app/parsers/concerns/bulkrax/csv_parser/csv_template_generation.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# frozen_string_literal: true

module Bulkrax
class CsvParser < ApplicationParser
module CsvTemplateGeneration
extend ActiveSupport::Concern

class_methods do
# Generate a CSV template for the specified models.
#
# The output format is checked against the two supported values before it is
# turned into a method name, so a caller-supplied string can never dispatch to
# an arbitrary (or private) `to_*` method on the template context.
#
# @param models [Array<String>, String] Model names or 'all' for all available models
# @param output [String, Symbol] Output format: 'file' or 'csv_string'
# @param admin_set_id [String, nil] Optional admin set ID for context
# @param args [Hash] Additional arguments passed to output method (e.g., file_path)
# @return [String] File path (for 'file' output) or CSV string (for 'csv_string' output)
# @raise [NameError] when Hyrax is not loaded
# @raise [ArgumentError] when output is not one of the supported formats
def generate_template(models: [], output: 'file', admin_set_id: nil, **args)
  raise NameError, "Hyrax is not defined" unless defined?(::Hyrax)

  format = output.to_s
  unless %w[file csv_string].include?(format)
    raise ArgumentError, "Unsupported template output #{output.inspect}; expected 'file' or 'csv_string'"
  end

  # public_send (not send) keeps dispatch limited to the context's public API.
  TemplateContext.new(models: models, admin_set_id: admin_set_id).public_send("to_#{format}", **args)
end
end

##
# State holder for one template-generation run.
# Exposes the readers (mappings, models, analyzer, mapping manager) that the
# CsvTemplate:: collaborators are handed via `self`.
class TemplateContext
  attr_reader :mappings, :all_models, :admin_set_id, :field_analyzer, :mapping_manager

  # @param models [Array<String>, String, nil] wrapped into an array and handed to the model loader
  # @param admin_set_id [String, nil] passed to the field analyzer and used for the default file path
  def initialize(models: nil, admin_set_id: nil)
    @admin_set_id = admin_set_id
    @mapping_manager = CsvTemplate::MappingManager.new
    @mappings = @mapping_manager.mappings
    @field_analyzer = CsvTemplate::FieldAnalyzer.new(@mappings, admin_set_id)
    @all_models = CsvTemplate::ModelLoader.new(Array.wrap(models)).models
    # Built last: the builder receives this context and may read the readers above.
    @csv_builder = CsvTemplate::CsvBuilder.new(self)
  end

  # Write the template to disk.
  #
  # @param file_path [String, nil] destination; a default path is generated when omitted
  # @return [String] the path that was written
  def to_file(file_path: nil)
    destination = file_path || CsvTemplate::FilePathGenerator.default_path(@admin_set_id)
    @csv_builder.write_to_file(destination)
    destination
  end

  # @return [String] the template rendered by the CSV builder
  def to_csv_string
    @csv_builder.generate_string
  end

  # Memoized per-model field summary.
  #
  # @return [Hash] model => { properties:, required_terms:, controlled_vocab_terms: }
  def field_metadata_for_all_models
    @field_metadata ||= @all_models.map { |model| [model, field_summary_for(model)] }.to_h
  end

  # Memoized list of headers considered valid for the loaded models.
  #
  # @return [Array<String>]
  def valid_headers_for_models
    @valid_headers ||= compute_valid_headers
  end

  private

  # Pull the three field lists for one model out of the analyzer's result,
  # substituting an empty array for any list that is absent.
  def field_summary_for(model)
    field_list = @field_analyzer.find_or_create_field_list_for(model_name: model)
    {
      properties: field_list.dig(model, "properties") || [],
      required_terms: field_list.dig(model, "required_terms") || [],
      controlled_vocab_terms: field_list.dig(model, "controlled_vocab_terms") || []
    }
  end

  # Ask the column builder for every column and drop the ignored properties.
  # On any StandardError the failure is logged and a fallback list (standard
  # fields plus every model's properties) is returned instead.
  def compute_valid_headers
    CsvTemplate::ColumnBuilder.new(self).all_columns - CsvTemplate::CsvBuilder::IGNORED_PROPERTIES
  rescue StandardError => e
    Rails.logger.error("Error building valid headers: #{e.message}")
    fallback_base = %w[model source_identifier parent parents file]
    property_names = field_metadata_for_all_models.values.flat_map { |meta| meta[:properties] }
    (fallback_base + property_names).uniq
  end
end
end
end
end
257 changes: 257 additions & 0 deletions app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
# frozen_string_literal: true

module Bulkrax
class CsvParser < ApplicationParser
module CsvValidation # rubocop:disable Metrics/ModuleLength
extend ActiveSupport::Concern

# Lightweight struct used to satisfy the CsvTemplate::ColumnBuilder
# interface without constructing a full template context.
#
# Assigned directly in the module body rather than inside an `included`
# block: constant assignment in a block is lexically scoped, so the constant
# lands on this module either way, but inside `included` it would be
# re-assigned (with an "already initialized constant" warning) every time the
# concern is mixed in.
ValidationContext = Struct.new(:mapping_manager, :field_analyzer, :all_models, :mappings, keyword_init: true)

class_methods do
# Run the guided-import validation pipeline over a CSV (and optional zip)
# without needing a persisted Importer record.
#
# @param csv_file [File, ActionDispatch::Http::UploadedFile, String] object responding to #path, or a path
# @param zip_file [File, ActionDispatch::Http::UploadedFile, nil] handed to the file validator
# @param admin_set_id [String, nil] handed to the field analyzer and file validator
# @return [Hash] camelCase-keyed summary (headers, missingRequired, unrecognized,
#   rowErrors, item hierarchy, file counts, isValid / hasWarnings flags)
def validate_csv(csv_file:, zip_file: nil, admin_set_id: nil) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  csv_path = if csv_file.respond_to?(:path)
               csv_file.path
             else
               csv_file.to_s
             end

  # Headers come from CsvEntry.read_data, the same reader the parser's
  # #records method uses, and are stringified for the header checks below.
  table = CsvEntry.read_data(csv_path)
  header_names = table.headers.map(&:to_s)

  # Column names for identifier / relationships / files follow the configured mappings.
  mapping_manager = CsvTemplate::MappingManager.new
  mappings = mapping_manager.mappings

  id_column = resolve_validation_key(mapping_manager, flag: 'source_identifier', default: :source_identifier)
  parent_column = resolve_validation_key(mapping_manager, flag: 'related_parents_field_mapping', default: :parents)
  children_column = resolve_validation_key(mapping_manager, flag: 'related_children_field_mapping', default: :children)
  file_column = resolve_validation_key(mapping_manager, key: 'file', default: :file)

  records = parse_validation_rows(table, id_column, parent_column, children_column, file_column)

  # Field metadata is only gathered for the models that actually occur in the rows.
  models_in_csv = records.map { |record| record[:model] }.compact.uniq
  field_analyzer = CsvTemplate::FieldAnalyzer.new(mappings, admin_set_id)
  field_metadata = build_validation_field_metadata(models_in_csv, field_analyzer)

  # Known headers, extended with the numerically suffixed variants present in
  # this CSV (e.g. title_1, creator_2) so they are not reported as unrecognised.
  known_headers = build_valid_validation_headers(mapping_manager, field_analyzer, models_in_csv, mappings, field_metadata)
  known_headers = (known_headers + header_names.grep(/_\d+\z/)).uniq

  missing_required = find_missing_required_headers(header_names, field_metadata, mapping_manager)
  unrecognized = find_unrecognized_validation_headers(header_names, known_headers)

  # Shared, mutable state passed to every row validator; validators append to
  # :errors / :warnings.
  validator_context = {
    errors: [],
    warnings: [],
    seen_ids: {},
    all_ids: records.map { |record| record[:source_identifier] }.compact.to_set,
    source_identifier: id_column.to_s,
    parent_split_pattern: resolve_parent_split_pattern(mappings),
    mappings: mappings,
    field_metadata: field_metadata
  }

  # Row numbers start at 2: 1-indexed, plus the header row.
  records.each.with_index(2) do |record, row_number|
    Bulkrax.csv_row_validators.each { |validator| validator.call(record, row_number, validator_context) }
  end

  file_validator = CsvTemplate::FileValidator.new(records, zip_file, admin_set_id)
  collections, works, file_sets = extract_validation_items(records)

  row_errors = validator_context[:errors]
  has_errors = missing_required.any? || header_names.blank? || records.empty? ||
               file_validator.missing_files.any? || row_errors.any?
  has_warnings = unrecognized.any? || file_validator.possible_missing_files?

  {
    headers: header_names,
    missingRequired: missing_required,
    unrecognized: unrecognized,
    rowCount: records.length,
    isValid: !has_errors,
    hasWarnings: has_warnings,
    rowErrors: row_errors,
    collections: collections,
    works: works,
    fileSets: file_sets,
    totalItems: records.length,
    fileReferences: file_validator.count_references,
    missingFiles: file_validator.missing_files,
    foundFiles: file_validator.found_files_count,
    zipIncluded: file_validator.zip_included?
  }.tap { |result| apply_rights_statement_validation_override!(result, missing_required) }
end

private

# Ask the mapping manager which column name is configured for a field and
# return the first candidate as a Symbol, for indexing symbol-keyed rows.
#
# @param mapping_manager [#resolve_column_name]
# @param key [String, nil] mapping key to look up
# @param flag [String, nil] mapping flag to look up
# @param default [Symbol] returned unchanged when no candidate is resolved
# @return [Symbol]
def resolve_validation_key(mapping_manager, key: nil, flag: nil, default:)
  candidates = mapping_manager.resolve_column_name(key: key, flag: flag, default: default.to_s)
  preferred = candidates.first
  preferred.nil? ? default : preferred.to_sym
end

# Convert each row of the parsed CSV into the record shape used by the
# validators: the five well-known fields under fixed symbol keys, plus the
# whole row as a string-keyed hash under :raw_row (the string keys come from
# `transform_keys(&:to_s)`).
#
# Any StandardError raised while mapping is logged and an empty array is
# returned instead.
#
# @param raw_csv [Enumerable] rows responding to #[] and #to_h
# @param source_id_key [Symbol] column holding the source identifier
# @param parent_key [Symbol] column holding parent references
# @param children_key [Symbol] column holding child references
# @param file_key [Symbol] column holding file references
# @return [Array<Hash>]
def parse_validation_rows(raw_csv, source_id_key, parent_key, children_key, file_key)
  column_for = {
    source_identifier: source_id_key,
    model: :model,
    parent: parent_key,
    children: children_key,
    file: file_key
  }

  raw_csv.map do |row|
    record = column_for.transform_values { |column| row[column] }
    record[:raw_row] = row.to_h.transform_keys(&:to_s)
    record
  end
rescue StandardError => e
  Rails.logger.error("CsvParser.validate_csv: error parsing rows – #{e.message}")
  []
end

# Build a per-model summary of the field lists reported by the analyzer.
# Each list that is absent from the analyzer's result becomes an empty array.
#
# @param all_models [Array<String>] model names found in the CSV
# @param field_analyzer [#find_or_create_field_list_for]
# @return [Hash] model => { properties:, required_terms:, controlled_vocab_terms: }
def build_validation_field_metadata(all_models, field_analyzer)
  sections = %w[properties required_terms controlled_vocab_terms]

  all_models.map do |model|
    field_list = field_analyzer.find_or_create_field_list_for(model_name: model)
    summary = sections.map { |section| [section.to_sym, field_list.dig(model, section) || []] }.to_h
    [model, summary]
  end.to_h
end

# Compute the headers considered valid for this CSV by asking
# CsvTemplate::ColumnBuilder for every column and removing the ignored
# properties. If anything in that path raises a StandardError, the failure is
# logged and a fallback list (standard fields plus every model's properties)
# is returned.
#
# @param mapping_manager [Object] stored on the context handed to the column builder
# @param field_analyzer [Object] stored on the context handed to the column builder
# @param all_models [Array<String>]
# @param mappings [Hash]
# @param field_metadata [Hash] used only for the fallback list
# @return [Array<String>]
def build_valid_validation_headers(mapping_manager, field_analyzer, all_models, mappings, field_metadata)
  context = ValidationContext.new(
    mapping_manager: mapping_manager,
    field_analyzer: field_analyzer,
    all_models: all_models,
    mappings: mappings
  )
  CsvTemplate::ColumnBuilder.new(context).all_columns - CsvTemplate::CsvBuilder::IGNORED_PROPERTIES
rescue StandardError => e
  Rails.logger.error("CsvParser.validate_csv: error building valid headers – #{e.message}")
  property_names = field_metadata.values.flat_map { |meta| meta[:properties] }
  (%w[model source_identifier parent parents file] + property_names).uniq
end

# List every required term that has no matching column in the CSV.
# Headers are first translated through the mapping manager and stripped of
# any trailing numeric suffix (`_1`, `_2`, ...) before comparison.
#
# @param headers [Array<String>] CSV header names
# @param field_metadata [Hash] model => { required_terms: [...] }
# @param mapping_manager [#mapped_to_key]
# @return [Array<Hash>] unique { model:, field: } pairs
def find_missing_required_headers(headers, field_metadata, mapping_manager)
  present_keys = headers.map { |header| mapping_manager.mapped_to_key(header).sub(/_\d+\z/, '') }.uniq

  field_metadata.flat_map do |model, meta|
    required = meta[:required_terms] || []
    required
      .reject { |field| present_keys.include?(field) }
      .map { |field| { model: model, field: field } }
  end.uniq
end

# Map each header that is not in the valid set (neither as written nor with a
# trailing numeric suffix removed) to the spell checker's first suggestion,
# or nil when it offers none.
#
# @param headers [Array<String>] CSV header names
# @param valid_headers [Array<String>] dictionary of accepted header names
# @return [Hash{String => String, nil}] unrecognised header => suggestion
def find_unrecognized_validation_headers(headers, valid_headers)
  spell_checker = DidYouMean::SpellChecker.new(dictionary: valid_headers)

  headers.each_with_object({}) do |header, suggestions|
    next if valid_headers.include?(header) || valid_headers.include?(header.sub(/_\d+\z/, ''))

    suggestions[header] = spell_checker.correct(header).first
  end
end

# Determine how multi-valued parent references should be split, based on the
# `split` setting of the `parents` mapping (string keys are tried first, then
# symbol keys).
#
# @param mappings [Hash] field mappings
# @return [Object, nil] nil when no split is configured, the Bulkrax default
#   pattern when the setting is literally `true`, otherwise the configured value
def resolve_parent_split_pattern(mappings)
  configured = mappings.dig('parents', 'split') || mappings.dig(:parents, :split)

  if configured.blank?
    nil
  elsif configured == true
    Bulkrax::DEFAULT_MULTI_VALUE_ELEMENT_SPLIT_ON
  else
    configured
  end
end

# Sort the parsed records into three lists for display.
#
# @param csv_data [Array<Hash>] parsed records
# @return [Array(Array, Array, Array)] collections, works and file sets, in that order
def extract_validation_items(csv_data)
  parents_by_child = build_child_to_parents_map(csv_data)
  # Positional order matters: collections, works, file_sets.
  buckets = Array.new(3) { [] }

  csv_data.each { |item| categorise_validation_item(item, parents_by_child, *buckets) }

  buckets
end

# Invert the `children` column: for every child id referenced by a record,
# collect the source identifiers of the records that reference it.
#
# The returned hash creates (and stores) an empty array for any key that is
# looked up but absent.
#
# @param csv_data [Array<Hash>] parsed records
# @return [Hash{String => Array}] child id => referencing parents' identifiers
def build_child_to_parents_map(csv_data)
  parents_by_child = Hash.new { |lookup, child_id| lookup[child_id] = [] }

  csv_data.each do |item|
    parent_id = item[:source_identifier]
    parse_relationship_field(item[:children]).each do |child_id|
      parents_by_child[child_id] << parent_id
    end
  end

  parents_by_child
end

# Append a display entry for one record to the matching list.
#
# The model name is compared case-insensitively: `collection` /
# `collectionresource` go to collections, `fileset` / `hyrax::fileset` go to
# file sets, and everything else is treated as a work. Collections and works
# also carry parent ids (explicit `parent` values plus parents inferred from
# other records' `children`) and child ids; file sets do not, and the
# child-to-parents map is not consulted for them.
#
# @param item [Hash] parsed record
# @param child_to_parents [Hash] child id => parent ids
# @param collections [Array] appended to in place
# @param works [Array] appended to in place
# @param file_sets [Array] appended to in place
# @return [Array] the list that received the entry
def categorise_validation_item(item, child_to_parents, collections, works, file_sets)
  identifier = item[:source_identifier]
  label = item[:raw_row]['title'] || identifier
  model_name = item[:model].to_s
  model_is = ->(*names) { names.any? { |name| model_name.casecmp(name).zero? } }

  kind, bucket =
    if model_is.call('collection', 'collectionresource')
      ['collection', collections]
    elsif model_is.call('fileset', 'hyrax::fileset')
      ['file_set', file_sets]
    else
      ['work', works]
    end

  entry = { id: identifier, title: label, type: kind }
  unless kind == 'file_set'
    declared_parents = parse_relationship_field(item[:parent])
    inferred_parents = child_to_parents[identifier] || []
    entry[:parentIds] = (declared_parents + inferred_parents).uniq
    entry[:childIds] = parse_relationship_field(item[:children])
  end

  bucket << entry
end

# Split a pipe-delimited relationship cell into trimmed, non-blank ids.
#
# @param value [Object, nil] raw cell value; converted with #to_s before splitting
# @return [Array<String>] ids in their original order; empty when the value is blank
def parse_relationship_field(value)
  return [] if value.blank?

  value.to_s.split('|').each_with_object([]) do |segment, ids|
    trimmed = segment.strip
    ids << trimmed unless trimmed.blank?
  end
end

# Downgrade an otherwise-failing result to "valid with warnings" when the only
# missing required field, across every model, is `rights_statement`.
#
# The override is skipped when the result is already valid, when there are no
# headers, or when any referenced file is missing. The result hash is mutated
# in place.
#
# NOTE(review): row-level errors (`result[:rowErrors]`) are not consulted here,
# so a result with row errors is still flipped to valid when the conditions
# above hold — confirm that this is intended.
#
# @param result [Hash] validation result; :isValid and :hasWarnings may be set to true
# @param missing_required [Array<Hash>] { model:, field: } entries
# @return [true, nil] true when the override was applied, nil otherwise
def apply_rights_statement_validation_override!(result, missing_required)
  return if result[:isValid]
  return unless missing_required.present?
  return unless missing_required.all? { |entry| entry[:field].to_s == 'rights_statement' }
  return if result[:headers].blank?
  return if result[:missingFiles]&.any?

  result[:isValid] = true
  result[:hasWarnings] = true
end
end
end
end
end
Loading
Loading