Skip to content

Commit d414d34

Browse files
authored
Merge pull request #906 from clarin-eric/devel
Devel-data
2 parents dd710c3 + 8738e1e commit d414d34

File tree

117 files changed

+28770
-372272
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

117 files changed

+28770
-372272
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
tmp/
22
*.tmp
33
nohup.*
4+
*.gz
45
*.zip
56
*.tar
67
*.tgz

Build/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
00*.txt
22
Temp/
3+
Test/
34
Logs/

Build/Makefile

Lines changed: 57 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
### COMPLETE SET OF CORPORA
88
#CORPORA=AT BA BE BG CZ DK EE ES ES-CT ES-GA ES-PV FI FR GB GR HR HU IS IT LV NL NO PL PT RS SE SI TR UA
9-
CORPORA=SI
9+
CORPORA=AT
1010
# Used in targets that run only for one corpus
1111
CORPUS=
1212

@@ -28,6 +28,11 @@ HANDLE-TEI = http://hdl.handle.net/11356/2004
2828
HANDLE-ANA = http://hdl.handle.net/11356/2005
2929
HANDLE-MT = http://hdl.handle.net/11356/2006
3030

31+
# For IL only:
32+
#VERSION = 1.0
33+
#HANDLE-TEI = http://hdl.handle.net/11356/2032
34+
#HANDLE-ANA = http://hdl.handle.net/11356/2032
35+
3136
#Where the produced corpora are put for inspection
3237
WEB = tomaz@nl.ijs.si:/home/tomaz/www/tmp/ParlaMint
3338

@@ -73,8 +78,8 @@ test-tei1:
7378
test-vert6:
7479
../Scripts/parlamintp-tei2vert-xx.pl ${HERE}/Distro/ParlaMint-LV.TEI.ana Test/ParlaMint-LV-xx.vert
7580
test-vert5:
76-
$s meta=../Build/Distro/ParlaMint-DK.TEI.ana/ParlaMint-DK.ana.xml -xsl:../Scripts/parlamint2xmlvert.xsl \
77-
../Build/Distro/ParlaMint-DK.TEI.ana/2020/ParlaMint-DK_2020-01-07-20191-M42.ana.xml > Test/test-DK.vert
81+
$s meta=../Build/Distro/ParlaMint-SI.TEI.ana/ParlaMint-SI.ana.xml -xsl:../Scripts/parlamint2xmlvert.xsl \
82+
../Build/Distro/ParlaMint-SI.TEI.ana/2020/ParlaMint-SI_2020-03-13-SDZ8-Izredna-31.ana.xml > Test/test-SI.vert
7883
test-vert4:
7984
$s meta=${HERE}/Distro/ParlaMint-SI.TEI.ana/ParlaMint-SI.ana.xml -xsl:../Scripts/parlamint2xmlvert.xsl \
8085
${HERE}/Distro/ParlaMint-SI.TEI.ana/2022/ParlaMint-SI_2022-01-13-SDZ8-Izredna-93.ana.xml > test-SI.vert
@@ -83,22 +88,27 @@ test-vert2:
8388
test-vert1:
8489
${FINALIZE} -vert -codes GR -in ${HERE}/Temp -out ${HERE}/Temp
8590
${FINALIZE} -vert -codes GR-en -in ${HERE}/Temp -out ${HERE}/Temp
91+
test-conll5:
92+
../Scripts/parlamintp2conllu.pl -jobs 1 -in ../Build/Sources-TEI/ParlaMint-AT.TEI.ana -out ../Build/Test
8693
test-conll4:
87-
$s meta=../Build/Distro/ParlaMint-AT.TEI.ana/ParlaMint-AT.ana.xml \
88-
-xsl:../Scripts/parlamint2conllu.xsl ../Build/Distro/ParlaMint-AT.TEI.ana/1996/ParlaMint-AT_1996-01-15-020-XX-NRSITZ-00003.ana.xml \
89-
> Test/test-AT.conllu
94+
$s meta=../Build/Distro/ParlaMint-SI.TEI.ana/ParlaMint-SI.ana.xml \
95+
-xsl:../Scripts/parlamint2conllu.xsl ../Build/Distro/ParlaMint-SI.TEI.ana/2000/ParlaMint-SI_2000-10-27-SDZ3-Redna-01.ana.xml \
96+
> Test/test-SI.conllu
9097
test-conll3:
9198
${FINALIZE} -conll -codes GR-en -in ${HERE}/Temp -out ${HERE}/Temp
9299
test-conll2:
93100
${FINALIZE} -conll -codes GR -in ${HERE}/Temp -out ${HERE}/Temp
94101
test-conll1:
95102
${FINALIZE} -conll -codes UA -in ${HERE}/Distro -out ${HERE}/Distro
103+
test-meta6:
104+
$s out-lang=en meta=../Build/Sources-TEI/ParlaMint-FR.TEI.ana/ParlaMint-FR.ana.xml -xsl:../Scripts/parlamint2meta.ana.xsl \
105+
../Build/Sources-TEI/ParlaMint-FR.TEI.ana/2020/ParlaMint-FR_2020-01-07-O1114.ana.xml > Test/test.ana-meta-en.tsv
96106
test-meta5:
97-
$s out-lang=en meta=../Build/Distro/ParlaMint-SE.TEI/ParlaMint-SE.xml -xsl:../Scripts/parlamint2meta.xsl \
98-
../Build/Distro/ParlaMint-SE.TEI/2015/ParlaMint-SE_2015-11-04-prot-201516--21.xml > test.tsv
107+
$s out-lang=en meta=../Build/Distro/ParlaMint-SI.TEI/ParlaMint-SI.xml -xsl:../Scripts/parlamint2meta.xsl \
108+
../Build/Distro/ParlaMint-SI.TEI/2000/ParlaMint-SI_2000-10-27-SDZ3-Redna-01.xml > Test/test.tsv
99109
test-meta4:
100-
$s out-lang=en meta=../Build/Distro/ParlaMint-PT.TEI/ParlaMint-PT.xml -xsl:../Scripts/parlamint2meta.xsl \
101-
../Build/Distro/ParlaMint-PT.TEI/2022/ParlaMint-PT_2022-03-22.xml > test.tsv
110+
$s out-lang=xx meta=../Build/Distro/ParlaMint-SI/ParlaMint-SI.xml -xsl:../Scripts/parlamint2meta.xsl \
111+
../Build/Distro/ParlaMint-SI/2007/ParlaMint-SI_2007-11-28-SDZ4-Izredna-30.ana.xml > Test/test.tsv
102112
test-meta3:
103113
$s out-lang=en meta=../Build/Distro/ParlaMint-RS.TEI/ParlaMint-RS.xml -xsl:../Scripts/parlamint2meta.xsl \
104114
../Build/Distro/ParlaMint-RS.TEI/2016/ParlaMint-RS_2016-06-03-0.xml > test.tsv
@@ -111,7 +121,17 @@ test-meta1:
111121
${FINALIZE} -txt -codes IS -in ${HERE}/Temp -out ${HERE}/Temp
112122
#${FINALIZE} -txt -codes GR-en -in ${HERE}/Temp -out ${HERE}/Temp
113123
test-text1:
114-
${FINALIZE} -txt -codes ES-CT -in ${HERE}/Distro -out ${HERE}/Distro
124+
${FINALIZE} -txt -codes ES -in ${HERE}/Distro -out ${HERE}/Distro
125+
test-text2:
126+
../Scripts/parlamintp-tei2text.pl -jobs 2 -in ../Build/Distro/ParlaMint-SI -out ../Build/Distro/ParlaMint-SI
127+
test-valid1:
128+
${FINALIZE} -valid -codes SI -in ${HERE}/Distro -out ${HERE}/Distro
129+
test-fix1:
130+
$s anaDir=../Build/Distro/ParlaMint-RS outDir=Test -xsl:../Scripts/parlamint2release.xsl \
131+
../Build/Distro/ParlaMint-RS/ParlaMint-RS.xml
132+
test-fix2:
133+
$s anaDir=../Build/Distro/ParlaMint-AT outDir=Test -xsl:../Scripts/parlamint2release.xsl \
134+
../Build/Distro/ParlaMint-AT/ParlaMint-AT.xml
115135

116136
### Fixes
117137
# Merge per-language translated CoNLL-Us (BE, ES-CT, ES-PV, UA) into a joint CoNLL-U (with # lang info on newpar)
@@ -205,7 +225,7 @@ web:
205225
rsync -av Packed/*.tgz ${WEB}/Repo
206226

207227
###### Targets for producing releasable version of ParlaMint corpora
208-
FINALIZE = perl ../Scripts/parlamint2distro.pl -version ${VERSION} -teihandle ${HANDLE-TEI} -anahandle ${HANDLE-ANA} -schema ../Schema -docs Sources-Distro
228+
FINALIZE = perl ../Scripts/parlamint2distro.pl -version ${VERSION} -teihandle ${HANDLE-TEI} -anahandle ${HANDLE-ANA} -schema ../Schema -docs Sources-Distro -procMemGB ${JAVA-MEMORY} -procChunkSize ${CHUNK-SIZE} -procThreads ${THREADS}
209229

210230
### For real
211231
# More than one nohup, in case we want several runs at once
@@ -217,7 +237,7 @@ nohup2:
217237
nohup3:
218238
nice nohup time make all > Logs/ParlaMint.3.log &
219239

220-
all: final
240+
all: final verts
221241
xall: final verts pack
222242

223243
pack:
@@ -244,7 +264,7 @@ final:
244264
### Make MTed corpora
245265

246266
# Make distribution with:
247-
FINALIZE-MT=perl ../Scripts/parlamint2distro.pl -version ${VERSION} -anahandle ${HANDLE-MT} -schema ${PARLAMINT}/Schema -docs ${HERE}/Sources-Distro
267+
FINALIZE-MT=perl ../Scripts/parlamint2distro.pl -version ${VERSION} -anahandle ${HANDLE-MT} -schema ${PARLAMINT}/Schema -docs ${HERE}/Sources-Distro -procMemGB ${JAVA-MEMORY} -procChunkSize ${CHUNK-SIZE} -procThreads ${THREADS}
248268

249269
# Targets
250270
mt-nohup1:
@@ -260,8 +280,8 @@ mt-nohup5:
260280
mt-nohup6:
261281
nice nohup time make mt-all > Logs/ParlaMint-en.6.log &
262282

263-
mt-all: mt-convert
264-
mt-xall-final: mt-convert mt-verts mt-pack mt-web
283+
mt-all: mt-final
284+
mt-xall-final: mt-convert mt-final mt-verts mt-pack mt-web
265285

266286
# Make MT .txt and CoNLL files
267287
mt-convert-txt:
@@ -309,19 +329,30 @@ sanity:
309329
zcat Verts/ParlaMint-XX.${VERSION}.vert.gz | grep -c '</s>'
310330
zcat Verts/ParlaMint-XX-en.${VERSION}.vert.gz | grep -c '</s>'
311331

312-
# Convert from English CoNLL-U + source .TEI.ana -> -en.TEI.ana
313-
mt-convert:
332+
# Convert from English CoNLL-U + source .TEI.ana -> -en.TEI.ana
333+
# This has already been done and probably won't be repeated, except for new corpora!
334+
xx-mt-convert:
314335
for CORPUS in ${CORPORA}; do \
315336
perl Scripts/parlamint-mt2tei.pl \
316337
${HERE}/Distro/ParlaMint-$${CORPUS}.TEI.ana/ParlaMint-$${CORPUS}.ana.xml \
317338
${SOURCES-MT}/ParlaMint-$${CORPUS}-en-notes.tsv \
318339
${SOURCES-MT}/ParlaMint-$${CORPUS}-en.sem \
319-
${TEMP}/ParlaMint-$${CORPUS}-en.TEI.ana 2> Logs/ParlaMint-$${CORPUS}-en.log; \
320-
${FINALIZE-MT} -all -notei -codes $${CORPUS}-en -in ${TEMP} -out ${HERE}/Distro \
321-
2>> Logs/ParlaMint-$${CORPUS}-en.log; \
340+
${SOURCES}/ParlaMint-$${CORPUS}-en.TEI.ana 2> Logs/ParlaMint-$${CORPUS}-en.prepare.log; \
341+
done;
342+
343+
# Finalise -en.TEI.ana corpora
344+
mt-final:
345+
for CORPUS in ${CORPORA}; do \
346+
${FINALIZE-MT} -all -notei -codes $${CORPUS}-en -in ${SOURCES} -out ${HERE}/Distro 2> Logs/ParlaMint-$${CORPUS}-en.log; \
322347
grep -a -i 'fatal' Logs/ParlaMint-$${CORPUS}-en.log > Logs/ParlaMint-$${CORPUS}-en.error.log; \
323348
grep -a -i 'error' Logs/ParlaMint-$${CORPUS}-en.log >> Logs/ParlaMint-$${CORPUS}-en.error.log; \
324349
grep -a -i 'warn' Logs/ParlaMint-$${CORPUS}-en.log > Logs/ParlaMint-$${CORPUS}-en.warn.log; \
350+
echo "$${CORPUS}-en.warn"; \
351+
cat Logs/ParlaMint-$${CORPUS}-en.warn.log | wc -l; \
352+
cat Logs/ParlaMint-$${CORPUS}-en.warn.log | sort | uniq | wc -l; \
353+
echo "$${CORPUS}-en.error"; \
354+
cat Logs/ParlaMint-$${CORPUS}-en.error.log | wc -l; \
355+
cat Logs/ParlaMint-$${CORPUS}-en.error.log | sort | uniq | wc -l; \
325356
done;
326357

327358
### Make CoNLL-U only
@@ -355,7 +386,7 @@ mt-test8:
355386
$s meta=${HERE}/Distro/ParlaMint-UA.TEI.ana/ParlaMint-UA.ana.xml -xsl:../Scripts/validate-parlamint.xsl \
356387
${HERE}/Distro/ParlaMint-UA.TEI.ana/2022/ParlaMint-UA_2022-01-25-m0.ana.xml
357388
mt-test7:
358-
$s meta=${HERE}/Distro/ParlaMint-AT-en.TEI.ana/ParlaMint-AT-en.ana.xml -xsl:../Scripts//check-links.xsl \
389+
$s meta=${HERE}/Distro/ParlaMint-AT-en.TEI.ana/ParlaMint-AT-en.ana.xml -xsl:../Scripts/check-links.xsl \
359390
${HERE}/Distro/ParlaMint-AT-en.TEI.ana/2022/ParlaMint-AT-en_2022-01-20-027-XXVII-NRSITZ-00139.ana.xml
360391
mt-test6:
361392
${FINALIZE-MT} -sample -codes ES-CT-en -out ${HERE}/Distro
@@ -384,7 +415,7 @@ mt-test1:
384415

385416
### Merging taxonomies (the common taxonomy files should now be edited directly; local copies are no longer merged)
386417

387-
TAXONOMIES-TEI = subcorpus speaker_types parla.legislature
418+
TAXONOMIES-TEI = subcorpus speaker_types parla.legislature topic
388419
TAXONOMIES-ANA = NER sentiment
389420
merge-taxos-nohup:
390421
nohup time make merge-taxos 2> Taxonomies/ParlaMint-taxonomy-merge.log > Logs/ParlaMint-taxonomy.log &
@@ -418,6 +449,9 @@ body:
418449
JAVA-MEMORY=240
419450
JM := $(shell test -n "$(JAVA-MEMORY)" && echo -n "-Xmx$(JAVA-MEMORY)g")
420451

452+
CHUNK-SIZE=100
453+
THREADS=7
454+
421455
P = parallel --citation --gnu --halt 2
422456
#Run java with a large heap, as a complete corpus needs to be read in
423457
s = java -jar $(JM) ../Scripts/bin/saxon.jar

Build/Sources-Distro/registry/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22
all: cp_regis
33

44
SRC=/project/clarinsi-cqp/registry/
5-
REGIST_VERSION=41
5+
REGIST_VERSION=50
66
REGIST_CORPORA=at ba be bg cz dk ee es es_ct es_ga es_pv fi fr gb gr hr hu is it lv nl no pl pt rs se si tr ua xx xx_en
77

88
# Registry files
99
cp_regis:
1010
rm -f registry/*
1111
for CORPUS in ${REGIST_CORPORA}; do \
12-
cp ${SRC}/parlamint${REGIST_VERSION}_$${CORPUS} parlamint${REGIST_VERSION}_$${CORPUS}; \
12+
cp -f ${SRC}/parlamint${REGIST_VERSION}_$${CORPUS} parlamint${REGIST_VERSION}_$${CORPUS}; \
1313
done;

0 commit comments

Comments
 (0)