From e75e8c2c5706a991d8373b21a08a7f35b50ea994 Mon Sep 17 00:00:00 2001 From: Dimitar Kumanov Date: Wed, 9 Oct 2019 12:03:19 +0300 Subject: [PATCH 1/6] Store document size of the physical file referenced in arkivstruktur --- .../main/java/com/documaster/validator/storage/model/Field.java | 1 + .../validation/noark5/parsers/ArchiveStructureHandler.java | 1 + .../src/main/resources/noark5/noark5.properties | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/noark-extraction-validator/src/main/java/com/documaster/validator/storage/model/Field.java b/noark-extraction-validator/src/main/java/com/documaster/validator/storage/model/Field.java index 10f8b9c..fde2b8f 100644 --- a/noark-extraction-validator/src/main/java/com/documaster/validator/storage/model/Field.java +++ b/noark-extraction-validator/src/main/java/com/documaster/validator/storage/model/Field.java @@ -32,6 +32,7 @@ public class Field { public static final String INTERNAL_ID = "_id"; public static final String DETECTED_CHECKSUM = "_detected_checksum"; + public static final String DETECTED_FILE_SIZE = "_detected_file_size"; public static final String DETECTED_FILE_TYPE = "_detected_type"; public static final String IS_VALID_FILE_TYPE = "_is_valid_type"; diff --git a/noark-extraction-validator/src/main/java/com/documaster/validator/validation/noark5/parsers/ArchiveStructureHandler.java b/noark-extraction-validator/src/main/java/com/documaster/validator/validation/noark5/parsers/ArchiveStructureHandler.java index f87a67b..0e7f199 100644 --- a/noark-extraction-validator/src/main/java/com/documaster/validator/validation/noark5/parsers/ArchiveStructureHandler.java +++ b/noark-extraction-validator/src/main/java/com/documaster/validator/validation/noark5/parsers/ArchiveStructureHandler.java @@ -121,6 +121,7 @@ public void endElement(String uri, String localName, String qName) throws SAXExc String contentType = isValidPdfA ? PDFAValidator.VALID_FILE_TYPE : PDFAValidator.getFileType(document); + getItem().add(Field.DETECTED_FILE_SIZE, document.length()); getItem().add(Field.DETECTED_FILE_TYPE, contentType); getItem().add(Field.DETECTED_CHECKSUM, checksum); getItem().add(Field.IS_VALID_FILE_TYPE, isValidPdfA); diff --git a/noark-extraction-validator/src/main/resources/noark5/noark5.properties b/noark-extraction-validator/src/main/resources/noark5/noark5.properties index 73ffab5..8599fcd 100644 --- a/noark-extraction-validator/src/main/resources/noark5/noark5.properties +++ b/noark-extraction-validator/src/main/resources/noark5/noark5.properties @@ -14,4 +14,4 @@ uniqueFields.loependejournal.journalpost = systemid uniqueFields.offentligjournal.journalpost = systemid # Additional fields that should be introduced per itemDef -additionalFields.arkivstruktur.dokumentobjekt = _detected_checksum, _detected_type, _is_valid_type +additionalFields.arkivstruktur.dokumentobjekt = _detected_checksum, _detected_file_size, _detected_type, _is_valid_type From 3ed6c979ab714d7ca1f54f5080972c11701c7534 Mon Sep 17 00:00:00 2001 From: Dimitar Kumanov Date: Wed, 9 Oct 2019 14:42:51 +0300 Subject: [PATCH 2/6] Add file size validation check to noark55 --- .../noark5/noark55/noark55-validation.xml | 33 ++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/noark-extraction-validator/src/main/resources/noark5/noark55/noark55-validation.xml b/noark-extraction-validator/src/main/resources/noark5/noark55/noark55-validation.xml index dbf91c9..1650a40 100644 --- a/noark-extraction-validator/src/main/resources/noark5/noark55/noark55-validation.xml +++ b/noark-extraction-validator/src/main/resources/noark5/noark55/noark55-validation.xml @@ -1997,6 +1997,31 @@ + Document object file sizes + + Tests whether the document object file sizes specified in arkivstruktur.xml + match the ones with the physical files on the file system. + + arkivstruktur + + + _detected_file_size; + ]]> + + + _detected_file_size; + ]]> + + + + + Correspondence parties Provides information about the number of correspondence parties grouped @@ -2025,7 +2050,7 @@ - + Screenings Provides information about the number of screened series, classes, @@ -2160,7 +2185,7 @@ - + Disposal decisions Provides information about the number of disposal decisions related to series, classes, @@ -2295,7 +2320,7 @@ - + Disposals Provides information about the number of disposals of series @@ -2380,7 +2405,7 @@ - + Personal name fields Checks whether all name fields contain seemingly valid personal names. The regular expressions used for From 67e36d9cd6b39a26f072571b79ddd579684b7d53 Mon Sep 17 00:00:00 2001 From: Dimitar Kumanov Date: Wed, 9 Oct 2019 15:07:11 +0300 Subject: [PATCH 3/6] Add file size validation check for noark53 and noark54 --- .../noark5/noark53/noark53-validation.xml | 33 ++++++++++++++++--- .../noark5/noark54/noark54-validation.xml | 33 ++++++++++++++++--- 2 files changed, 58 insertions(+), 8 deletions(-) diff --git a/noark-extraction-validator/src/main/resources/noark5/noark53/noark53-validation.xml b/noark-extraction-validator/src/main/resources/noark5/noark53/noark53-validation.xml index 5bef23f..47669c6 100644 --- a/noark-extraction-validator/src/main/resources/noark5/noark53/noark53-validation.xml +++ b/noark-extraction-validator/src/main/resources/noark5/noark53/noark53-validation.xml @@ -1994,6 +1994,31 @@ + Document object file sizes + + Tests whether the document object file sizes specified in arkivstruktur.xml + match the ones with the physical files on the file system. + + arkivstruktur + + + _detected_file_size; + ]]> + + + _detected_file_size; + ]]> + + + + + Correspondence parties Provides information about the number of correspondence parties grouped @@ -2022,7 +2047,7 @@ - + Screenings Provides information about the number of screened series, classes, @@ -2157,7 +2182,7 @@ - + Disposal decisions Provides information about the number of disposal decisions related to series, classes, @@ -2292,7 +2317,7 @@ - + Disposals Provides information about the number of disposals of series @@ -2377,7 +2402,7 @@ - + Personal name fields Checks whether all name fields contain seemingly valid personal names. The regular expressions used for diff --git a/noark-extraction-validator/src/main/resources/noark5/noark54/noark54-validation.xml b/noark-extraction-validator/src/main/resources/noark5/noark54/noark54-validation.xml index ee0d0c8..0e698aa 100644 --- a/noark-extraction-validator/src/main/resources/noark5/noark54/noark54-validation.xml +++ b/noark-extraction-validator/src/main/resources/noark5/noark54/noark54-validation.xml @@ -1994,6 +1994,31 @@ + Document object file sizes + + Tests whether the document object file sizes specified in arkivstruktur.xml + match the ones with the physical files on the file system. + + arkivstruktur + + + _detected_file_size; + ]]> + + + _detected_file_size; + ]]> + + + + + Correspondence parties Provides information about the number of correspondence parties grouped @@ -2022,7 +2047,7 @@ - + Screenings Provides information about the number of screened series, classes, @@ -2157,7 +2182,7 @@ - + Disposal decisions Provides information about the number of disposal decisions related to series, classes, @@ -2292,7 +2317,7 @@ - + Disposals Provides information about the number of disposals of series @@ -2377,7 +2402,7 @@ - + Personal name fields Checks whether all name fields contain seemingly valid personal names. The regular expressions used for From 76f15940d3764311fea76ba370d17dbe865c989b Mon Sep 17 00:00:00 2001 From: Dimitar Kumanov Date: Wed, 9 Oct 2019 16:01:44 +0300 Subject: [PATCH 4/6] Update readme documentation --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0659bc4..1787ba6 100644 --- a/README.md +++ b/README.md @@ -274,11 +274,12 @@ Below are all validation rules ordered by their execution priority (every test i |Test|Info,Warning|AST15|Document description period containment|Tests whether the created and finalized dates of all document descriptions are within the archival period specified in arkivuttrekk.xml.| |Test|Info,Error|AST16|Document object checksums|Tests whether the document object checksums specified in arkivstruktur.xml match the ones that the validator calculated using the SHA256 algorithm.| |Test|Info,Error|AST17|Document object file types|Tests whether all document objects are valid PDF/A-1B documents.| -|Test|Info,Error|AST18|Correspondence parties|Provides information about the number of correspondence parties grouped by series and tests whether any registry entries without correspondence parties exist.| -|Test|Info,Warning|AST19|Screenings|Provides information about the number of screened series, classes, files, records, and document descriptions, and tests whether the corresponding value in arkivuttrekk.xml (inneholderSkjermetInformasjon) is correct.| -|Test|Info,Warning|AST20|Disposal decisions|Provides information about the number of disposal decisions related to series, classes, files, records, and document descriptions, and tests whether the corresponding value in arkivuttrekk.xml (inneholderDokumenterSomSkalKasseres) is correct.| -|Test|Info,Warning|AST21|Disposals|Provides information about the number of disposals of series and document descriptions, and tests whether the corresponding value in arkivuttrekk.xml (omfatterDokumenterSomErKassert) is correct.| -|Test|Info,Warning|AST22|Personal name fields|Checks whether all name fields contain seemingly valid personal names. The regular expressions used for the purpose are "\^[\p{L}\s-'.]*$" and "\^[\p{L}\s-'.]+$" depending on whether the value can be blank or not. The validated fields are: saksansvarlig (M306), kontaktperson (M412), korrespondansepartNavn (M400), sakspartNavn (M302), moeteDeltakerNavn (M372), arkivertAv (M605), avskrevetAv (M618), tilknyttetAv (M621), merknadRegistrertAv (M612), kassertAv (M631), slettetAv (M614), gradertAv (M625), presedensGodkjentAv (M629), verifisertAv (M623), nedgradertAv (M627), opprettetAv (M601), avsluttetAv (M603).| +|Test|Info,Error|AST18|Document object file sizes|Tests whether the recorded file size in arkivstruktur.xml is exactly the same as the one of the referenced physical file on the system.| +|Test|Info,Error|AST19|Correspondence parties|Provides information about the number of correspondence parties grouped by series and tests whether any registry entries without correspondence parties exist.| +|Test|Info,Warning|AST20|Screenings|Provides information about the number of screened series, classes, files, records, and document descriptions, and tests whether the corresponding value in arkivuttrekk.xml (inneholderSkjermetInformasjon) is correct.| +|Test|Info,Warning|AST21|Disposal decisions|Provides information about the number of disposal decisions related to series, classes, files, records, and document descriptions, and tests whether the corresponding value in arkivuttrekk.xml (inneholderDokumenterSomSkalKasseres) is correct.| +|Test|Info,Warning|AST22|Disposals|Provides information about the number of disposals of series and document descriptions, and tests whether the corresponding value in arkivuttrekk.xml (omfatterDokumenterSomErKassert) is correct.| +|Test|Info,Warning|AST23|Personal name fields|Checks whether all name fields contain seemingly valid personal names. The regular expressions used for the purpose are "\^[\p{L}\s-'.]*$" and "\^[\p{L}\s-'.]+$" depending on whether the value can be blank or not. The validated fields are: saksansvarlig (M306), kontaktperson (M412), korrespondansepartNavn (M400), sakspartNavn (M302), moeteDeltakerNavn (M372), arkivertAv (M605), avskrevetAv (M618), tilknyttetAv (M621), merknadRegistrertAv (M612), kassertAv (M631), slettetAv (M614), gradertAv (M625), presedensGodkjentAv (M629), verifisertAv (M623), nedgradertAv (M627), opprettetAv (M601), avsluttetAv (M603).| **loependejournal:** From dba3823866f02b8069321f604bdc24cb0b1b97de Mon Sep 17 00:00:00 2001 From: Dimitar Kumanov Date: Thu, 10 Oct 2019 12:20:42 +0300 Subject: [PATCH 5/6] Reorder validation tests so that filesize validation is last --- .../noark5/noark53/noark53-validation.xml | 58 +++++++++---------- .../noark5/noark54/noark54-validation.xml | 57 +++++++++--------- .../noark5/noark55/noark55-validation.xml | 57 +++++++++--------- 3 files changed, 85 insertions(+), 87 deletions(-) diff --git a/noark-extraction-validator/src/main/resources/noark5/noark53/noark53-validation.xml b/noark-extraction-validator/src/main/resources/noark5/noark53/noark53-validation.xml index 47669c6..3351e78 100644 --- a/noark-extraction-validator/src/main/resources/noark5/noark53/noark53-validation.xml +++ b/noark-extraction-validator/src/main/resources/noark5/noark53/noark53-validation.xml @@ -1994,31 +1994,6 @@ - Document object file sizes - - Tests whether the document object file sizes specified in arkivstruktur.xml - match the ones with the physical files on the file system. - - arkivstruktur - - - _detected_file_size; - ]]> - - - _detected_file_size; - ]]> - - - - - Correspondence parties Provides information about the number of correspondence parties grouped @@ -2047,7 +2022,7 @@ - + Screenings Provides information about the number of screened series, classes, @@ -2182,7 +2157,7 @@ - + Disposal decisions Provides information about the number of disposal decisions related to series, classes, @@ -2317,7 +2292,7 @@ - + Disposals Provides information about the number of disposals of series @@ -2402,7 +2377,7 @@ - + Personal name fields Checks whether all name fields contain seemingly valid personal names. The regular expressions used for @@ -2893,6 +2868,31 @@ + + Document object file sizes + + Tests whether the document object file sizes specified in arkivstruktur.xml + match the ones with the physical files on the file system. + + arkivstruktur + + + _detected_file_size; + ]]> + + + _detected_file_size; + ]]> + + + +