Skip to content

Commit 07cf820

Browse files
authored
Merge pull request #686 from IU-Libraries-Joint-Development/essi-2130_remote_file_retrieval
[ESSI-2130] apply Fedora/S3 file retrieval for OCR generation, characterization, PDF generation, download
2 parents 23d37a6 + fe4ef53 commit 07cf820

File tree

8 files changed

+202
-14
lines changed

8 files changed

+202
-14
lines changed

app/jobs/characterize_job.rb

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,9 @@ class CharacterizeJob < ApplicationJob
1010
def perform(file_set, file_id, filepath = nil, derivation_path = nil, delete_characterization_path = false)
1111
raise "#{file_set.class.characterization_proxy} was not found for FileSet #{file_set.id}" unless file_set.characterization_proxy?
1212
unless filepath && File.exist?(filepath)
13-
if file_set.content_location&.start_with?('s3://') # Ensure external file is available locally
14-
ext_id = file_set.content_location.split('/').last
15-
ext_resp = ESSI.external_storage.get(ext_id)
16-
filepath = Hyrax::WorkingDirectory.send(:copy_stream_to_working_directory, ext_id, ext_id, ext_resp.body)
17-
delete_characterization_path = true
18-
else
19-
filepath = Hyrax::WorkingDirectory.find_or_retrieve(file_id, file_set.id) unless filepath && File.exist?(filepath)
20-
delete_characterization_path = false
21-
end
13+
delete_characterization_path = file_set.external?
2214
end
15+
filepath = file_set.find_or_retrieve(file_id: file_id, filepath: filepath)
2316
Hydra::Works::CharacterizationService.run(file_set.characterization_proxy, filepath)
2417
Rails.logger.debug "Ran characterization on #{file_set.characterization_proxy.id} (#{file_set.characterization_proxy.mime_type})"
2518
file_set.characterization_proxy.save!

app/jobs/create_ocr_job.rb

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,12 @@ class CreateOCRJob < CreateDerivativesJob
66
# @param [String, NilClass] filepath the cached file within the Hyrax.config.working_path
77
def perform(file_set, file_id, filepath = nil)
88
return if file_set.video? && !Hyrax.config.enable_ffmpeg
9-
filename = Hyrax::WorkingDirectory.find_or_retrieve(file_id, file_set.id, filepath)
9+
10+
filepath = file_set.find_or_retrieve(file_id: file_id, filepath: filepath)
1011

1112
# using #create_derivatives instead of #create_ocr_derivatives to use IIIF Print
1213
# @see https://github.com/scientist-softserv/iiif_print/blob/d14246664048c708071c7ff4de2e9a34aa703465/app/services/iiif_print/pluggable_derivative_service.rb#L25
13-
file_set.send(:file_set_derivatives_service).send(:create_derivatives, filename)
14+
file_set.send(:file_set_derivatives_service).send(:create_derivatives, filepath)
1415

1516
# Reload from Fedora and reindex for thumbnail and extracted text
1617
file_set.reload

app/models/file_set.rb

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,23 @@ def ocr_language
2929
def original_file_id
3030
@original_file_id ||= self.original_file&.id
3131
end
32+
33+
def external?
34+
ESSI.external_storage.external?(self)
35+
end
36+
37+
def external_id
38+
@external_id ||= ESSI.external_storage.external_id(self)
39+
end
40+
41+
# supplement to Hyrax::WorkingDirectory.find_or_retrieve, but aware of external storage
42+
def find_or_retrieve(file_id: original_file&.id, filepath: nil)
43+
return filepath if filepath && File.exist?(filepath)
44+
if self.external?
45+
filepath = ESSI.external_storage.find_or_retrieve(self, file_id: file_id, filepath: filepath)
46+
else
47+
filepath = Hyrax::WorkingDirectory.find_or_retrieve(file_id, self.id, filepath) # filepath is passed positionally, matching the other WorkingDirectory.find_or_retrieve calls
48+
end
49+
return filepath
50+
end
3251
end

app/services/essi/external_storage_service.rb

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,28 @@ def prefix_id(id)
9595
"#{@prefix}/#{treeify_id(id)}"
9696
end
9797

98+
def external?(file_set)
99+
file_set.content_location&.start_with?('s3://') || false
100+
end
101+
102+
def external_id(file_set)
103+
file_set.content_location.split('/').last if external?(file_set)
104+
end
105+
106+
# external equivalent to Hyrax::WorkingDirectory.find_or_retrieve
107+
def find_or_retrieve(file_set, file_id: file_set.original_file&.id, filepath: nil)
108+
return filepath if filepath && File.exist?(filepath)
109+
if external?(file_set)
110+
ext_id = external_id(file_set)
111+
ext_resp = get(ext_id)
112+
filepath = Hyrax::WorkingDirectory.send(:copy_stream_to_working_directory, ext_id, ext_id, ext_resp.body)
113+
else
114+
Rails.logger.warn("External storage find_or_retrieve called for FileSet #{file_set.id} not stored externally")
115+
filepath = nil
116+
end
117+
return filepath
118+
end
119+
98120
private
99121

100122
def endpoint

app/services/essi/generate_pdf_service.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def render_dimensions(image_width)
7373
def generate_width(file_set_id)
7474
begin
7575
file_set = FileSet.find(file_set_id)
76-
filepath = Hyrax::WorkingDirectory.find_or_retrieve(file_set.original_file.id, file_set.id)
76+
filepath = file_set.find_or_retrieve
7777
terms = Hydra::Works::CharacterizationService.run(file_set.original_file, filepath)
7878
CharacterizeJob.perform_later(file_set, file_set.original_file.id)
7979
rescue

lib/extensions/hyrax/downloads_controller/external_storage.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ module ExternalStorage
66
def show
77
if params[:file] && params[:file] == 'extracted_text'
88
super
9-
elsif asset.content_location&.start_with?('s3://')
10-
ext_id = asset.content_location.split('/').last
9+
elsif asset.try(:external?)
10+
ext_id = asset.external_id
1111
external_asset = ESSI.external_storage.get(ext_id)
1212
send_data external_asset.body.read, filename: ext_id
1313
else

spec/models/file_set_spec.rb

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,78 @@
3636
end
3737
end
3838
end
39+
40+
let(:external_id) { "s3_id" }
41+
let(:external_location) { "s3://#{external_id}" }
42+
43+
describe "#external?" do
44+
context "when stored in S3" do
45+
before { allow(file_set).to receive(:content_location).and_return(external_location) }
46+
it "returns true" do
47+
expect(file_set.external?).to eq true
48+
end
49+
end
50+
context "when stored in Fedora" do
51+
it "returns false" do
52+
expect(file_set.external?).to eq false
53+
end
54+
end
55+
end
56+
57+
describe "#external_id" do
58+
context "when stored in S3" do
59+
before { allow(file_set).to receive(:content_location).and_return(external_location) }
60+
it "returns the S3 internal id" do
61+
expect(file_set.external_id).to eq external_id
62+
end
63+
end
64+
context "when stored in Fedora" do
65+
it "returns nil" do
66+
expect(file_set.external_id).to be_nil
67+
end
68+
end
69+
end
70+
71+
describe "#find_or_retrieve" do
72+
shared_examples "find_or_retrieve examples" do |argument_filepath|
73+
context "when file is stored in S3" do
74+
before { allow(file_set).to receive(:content_location).and_return(external_location) }
75+
let(:output_filepath) { 'filepath_from_s3' }
76+
before { allow(ESSI.external_storage).to receive(:find_or_retrieve).and_return(output_filepath) }
77+
it "calls ESSI.external_storage.find_or_retrieve" do
78+
expect(ESSI.external_storage).to receive(:find_or_retrieve)
79+
file_set.find_or_retrieve(filepath: argument_filepath)
80+
end
81+
it "returns filepath" do
82+
expect(file_set.find_or_retrieve(filepath: argument_filepath)).to eq output_filepath
83+
end
84+
end
85+
context "when file is stored in Fedora" do
86+
let(:output_filepath) { 'filepath_from_fedora' }
87+
before { allow(Hyrax::WorkingDirectory).to receive(:find_or_retrieve).and_return(output_filepath) }
88+
it "calls Hyrax::WorkingDirectory.find_or_retrieve" do
89+
expect(Hyrax::WorkingDirectory).to receive(:find_or_retrieve)
90+
file_set.find_or_retrieve(filepath: argument_filepath)
91+
end
92+
it "returns filepath" do
93+
expect(file_set.find_or_retrieve(filepath: argument_filepath)).to eq output_filepath
94+
end
95+
end
96+
end
97+
context "when filepath provided" do
98+
let(:argument_filepath) { '/tmp/existing_file.txt' }
99+
context "when file present" do
100+
before { allow(File).to receive(:exist?).with(argument_filepath).and_return(true) }
101+
it "returns the filepath" do
102+
expect(file_set.find_or_retrieve(filepath: argument_filepath)).to eq argument_filepath
103+
end
104+
end
105+
context "when file absent" do
106+
include_examples "find_or_retrieve examples", '/tmp/existing_file.txt'
107+
end
108+
end
109+
context "when filepath not provided" do
110+
include_examples "find_or_retrieve examples", nil
111+
end
112+
end
39113
end

spec/services/essi/external_storage_service_spec.rb

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,83 @@
4242
expect(service.prefix_id(id)).to eq('ext-store/es/si/-e/xt/essi-ext-store-spec')
4343
end
4444
end
45+
46+
let(:file_set) { FactoryBot.build(:file_set) }
47+
let(:external_id) { "s3_id" }
48+
let(:external_location) { "s3://#{external_id}" }
49+
describe "#external?" do
50+
context "when stored in S3" do
51+
before { allow(file_set).to receive(:content_location).and_return(external_location) }
52+
it "returns true" do
53+
expect(service.external?(file_set)).to eq true
54+
end
55+
end
56+
context "when stored in Fedora" do
57+
it "returns false" do
58+
expect(service.external?(file_set)).to eq false
59+
end
60+
end
61+
end
62+
63+
describe "#external_id" do
64+
context "when stored in S3" do
65+
before { allow(file_set).to receive(:content_location).and_return(external_location) }
66+
it "returns the S3 internal id" do
67+
expect(service.external_id(file_set)).to eq external_id
68+
end
69+
end
70+
context "when stored in Fedora" do
71+
it "returns nil" do
72+
expect(service.external_id(file_set)).to be_nil # exercise the service under test, not FileSet#external_id
73+
end
74+
end
75+
end
76+
77+
describe "#find_or_retrieve" do
78+
shared_examples "find_or_retrieve examples" do |argument_filepath|
79+
context "when file is stored in S3" do
80+
let(:output_filepath) { 'filepath_from_s3' }
81+
before do
82+
allow(file_set).to receive(:content_location).and_return("s3://server/external_id")
83+
allow(service).to receive(:get).and_return(double(body: nil))
84+
allow(Hyrax::WorkingDirectory).to receive(:copy_stream_to_working_directory).and_return(output_filepath)
85+
end
86+
it "retrieves external file content" do
87+
expect(service).to receive(:get)
88+
service.find_or_retrieve(file_set, filepath: argument_filepath)
89+
end
90+
it "copies stream to working directory" do
91+
expect(Hyrax::WorkingDirectory).to receive(:copy_stream_to_working_directory)
92+
service.find_or_retrieve(file_set, filepath: argument_filepath)
93+
end
94+
it "returns filepath" do
95+
expect(service.find_or_retrieve(file_set, filepath: argument_filepath)).to eq output_filepath
96+
end
97+
end
98+
context "when file is stored in Fedora" do
99+
it "logs warning" do
100+
expect(Rails.logger).to receive(:warn)
101+
service.find_or_retrieve(file_set, filepath: argument_filepath)
102+
end
103+
it "returns nil" do
104+
expect(service.find_or_retrieve(file_set, filepath: argument_filepath)).to be_nil
105+
end
106+
end
107+
end
108+
context "when filepath provided" do
109+
let(:argument_filepath) { '/tmp/existing_file.txt' }
110+
context "when file present" do
111+
before { allow(File).to receive(:exist?).with(argument_filepath).and_return(true) }
112+
it "returns the filepath" do
113+
expect(service.find_or_retrieve(file_set, filepath: argument_filepath)).to eq argument_filepath
114+
end
115+
end
116+
context "when file absent" do
117+
include_examples "find_or_retrieve examples", '/tmp/existing_file.txt'
118+
end
119+
end
120+
context "when filepath not provided" do
121+
include_examples "find_or_retrieve examples", nil
122+
end
123+
end
45124
end

0 commit comments

Comments
 (0)