66
77# ## COMPLETE SET OF CORPORA
88# CORPORA=AT BA BE BG CZ DK EE ES ES-CT ES-GA ES-PV FI FR GB GR HR HU IS IT LV NL NO PL PT RS SE SI TR UA
9- CORPORA =SI
9+ CORPORA =AT
1010# Used in targets that run only for one corpus
1111CORPUS =
1212
@@ -28,6 +28,11 @@ HANDLE-TEI = http://hdl.handle.net/11356/2004
2828HANDLE-ANA = http://hdl.handle.net/11356/2005
2929HANDLE-MT = http://hdl.handle.net/11356/2006
3030
31+ # For IL only:
32+ # VERSION = 1.0
33+ # HANDLE-TEI = http://hdl.handle.net/11356/2032
34+ # HANDLE-ANA = http://hdl.handle.net/11356/2032
35+
3136# Where the produced corpora are put for inspection
3237WEB = tomaz@nl.ijs.si:/home/tomaz/www/tmp/ParlaMint
3338
@@ -73,8 +78,8 @@ test-tei1:
7378test-vert6 :
7479 ../Scripts/parlamintp-tei2vert-xx.pl ${HERE} /Distro/ParlaMint-LV.TEI.ana Test/ParlaMint-LV-xx.vert
7580test-vert5 :
76- $s meta=../Build/Distro/ParlaMint-DK .TEI.ana/ParlaMint-DK .ana.xml -xsl:../Scripts/parlamint2xmlvert.xsl \
77- ../Build/Distro/ParlaMint-DK .TEI.ana/2020/ParlaMint-DK_2020-01-07-20191-M42 .ana.xml > Test/test-DK .vert
81+ $s meta=../Build/Distro/ParlaMint-SI .TEI.ana/ParlaMint-SI .ana.xml -xsl:../Scripts/parlamint2xmlvert.xsl \
82+ ../Build/Distro/ParlaMint-SI .TEI.ana/2020/ParlaMint-SI_2020-03-13-SDZ8-Izredna-31 .ana.xml > Test/test-SI .vert
7883test-vert4 :
7984 $s meta=${HERE} /Distro/ParlaMint-SI.TEI.ana/ParlaMint-SI.ana.xml -xsl:../Scripts/parlamint2xmlvert.xsl \
8085 ${HERE} /Distro/ParlaMint-SI.TEI.ana/2022/ParlaMint-SI_2022-01-13-SDZ8-Izredna-93.ana.xml > test-SI.vert
@@ -83,22 +88,27 @@ test-vert2:
8388test-vert1 :
8489 ${FINALIZE} -vert -codes GR -in ${HERE} /Temp -out ${HERE} /Temp
8590 ${FINALIZE} -vert -codes GR-en -in ${HERE} /Temp -out ${HERE} /Temp
91+ test-conll5 :
92+ ../Scripts/parlamintp2conllu.pl -jobs 1 -in ../Build/Sources-TEI/ParlaMint-AT.TEI.ana -out ../Build/Test
8693test-conll4 :
87- $s meta=../Build/Distro/ParlaMint-AT .TEI.ana/ParlaMint-AT .ana.xml \
88- -xsl:../Scripts/parlamint2conllu.xsl ../Build/Distro/ParlaMint-AT .TEI.ana/1996 /ParlaMint-AT_1996-01-15-020-XX-NRSITZ-00003 .ana.xml \
89- > Test/test-AT .conllu
94+ $s meta=../Build/Distro/ParlaMint-SI .TEI.ana/ParlaMint-SI .ana.xml \
95+ -xsl:../Scripts/parlamint2conllu.xsl ../Build/Distro/ParlaMint-SI .TEI.ana/2000 /ParlaMint-SI_2000-10-27-SDZ3-Redna-01 .ana.xml \
96+ > Test/test-SI .conllu
9097test-conll3 :
9198 ${FINALIZE} -conll -codes GR-en -in ${HERE} /Temp -out ${HERE} /Temp
9299test-conll2 :
93100 ${FINALIZE} -conll -codes GR -in ${HERE} /Temp -out ${HERE} /Temp
94101test-conll1 :
95102 ${FINALIZE} -conll -codes UA -in ${HERE} /Distro -out ${HERE} /Distro
103+ test-meta6 :
104+ $s out-lang=en meta=../Build/Sources-TEI/ParlaMint-FR.TEI.ana/ParlaMint-FR.ana.xml -xsl:../Scripts/parlamint2meta.ana.xsl \
105+ ../Build/Sources-TEI/ParlaMint-FR.TEI.ana/2020/ParlaMint-FR_2020-01-07-O1114.ana.xml > Test/test.ana-meta-en.tsv
96106test-meta5 :
97- $s out-lang=en meta=../Build/Distro/ParlaMint-SE .TEI/ParlaMint-SE .xml -xsl:../Scripts/parlamint2meta.xsl \
98- ../Build/Distro/ParlaMint-SE .TEI/2015 /ParlaMint-SE_2015-11-04-prot-201516--21 .xml > test.tsv
107+ $s out-lang=en meta=../Build/Distro/ParlaMint-SI .TEI/ParlaMint-SI .xml -xsl:../Scripts/parlamint2meta.xsl \
108+ ../Build/Distro/ParlaMint-SI .TEI/2000 /ParlaMint-SI_2000-10-27-SDZ3-Redna-01 .xml > Test/ test.tsv
99109test-meta4 :
100- $s out-lang=en meta=../Build/Distro/ParlaMint-PT.TEI /ParlaMint-PT .xml -xsl:../Scripts/parlamint2meta.xsl \
101- ../Build/Distro/ParlaMint-PT.TEI/2022 /ParlaMint-PT_2022-03-22. xml > test.tsv
110+ $s out-lang=xx meta=../Build/Distro/ParlaMint-SI /ParlaMint-SI .xml -xsl:../Scripts/parlamint2meta.xsl \
111+ ../Build/Distro/ParlaMint-SI/2007 /ParlaMint-SI_2007-11-28-SDZ4-Izredna-30.ana. xml > Test/ test.tsv
102112test-meta3 :
103113 $s out-lang=en meta=../Build/Distro/ParlaMint-RS.TEI/ParlaMint-RS.xml -xsl:../Scripts/parlamint2meta.xsl \
104114 ../Build/Distro/ParlaMint-RS.TEI/2016/ParlaMint-RS_2016-06-03-0.xml > test.tsv
@@ -111,7 +121,17 @@ test-meta1:
111121 ${FINALIZE} -txt -codes IS -in ${HERE} /Temp -out ${HERE} /Temp
112122 # ${FINALIZE} -txt -codes GR-en -in ${HERE}/Temp -out ${HERE}/Temp
113123test-text1 :
114- ${FINALIZE} -txt -codes ES-CT -in ${HERE} /Distro -out ${HERE} /Distro
124+ ${FINALIZE} -txt -codes ES -in ${HERE} /Distro -out ${HERE} /Distro
125+ test-text2 :
126+ ../Scripts/parlamintp-tei2text.pl -jobs 2 -in ../Build/Distro/ParlaMint-SI -out ../Build/Distro/ParlaMint-SI
127+ test-valid1 :
128+ ${FINALIZE} -valid -codes SI -in ${HERE} /Distro -out ${HERE} /Distro
129+ test-fix1 :
130+ $s anaDir=../Build/Distro/ParlaMint-RS outDir=Test -xsl:../Scripts/parlamint2release.xsl \
131+ ../Build/Distro/ParlaMint-RS/ParlaMint-RS.xml
132+ test-fix2 :
133+ $s anaDir=../Build/Distro/ParlaMint-AT outDir=Test -xsl:../Scripts/parlamint2release.xsl \
134+ ../Build/Distro/ParlaMint-AT/ParlaMint-AT.xml
115135
116136# ## Fixes
117137# Merge per-language translated CoNLL-Us (BE, ES-CT, ES-PV, UA) to joint CoNLL-U (with # lang info on newpar)
205225 rsync -av Packed/* .tgz ${WEB} /Repo
206226
207227# ##### Targets for producing releasable version of ParlaMint corpora
208- FINALIZE = perl ../Scripts/parlamint2distro.pl -version ${VERSION} -teihandle ${HANDLE-TEI} -anahandle ${HANDLE-ANA} -schema ../Schema -docs Sources-Distro
228+ FINALIZE = perl ../Scripts/parlamint2distro.pl -version ${VERSION} -teihandle ${HANDLE-TEI} -anahandle ${HANDLE-ANA} -schema ../Schema -docs Sources-Distro -procMemGB ${JAVA-MEMORY} -procChunkSize ${CHUNK-SIZE} -procThreads ${THREADS}
209229
210230# ## For real
211231# More than one nohup, in case we want several runs at once
@@ -217,7 +237,7 @@ nohup2:
217237nohup3 :
218238 nice nohup time make all > Logs/ParlaMint.3.log &
219239
220- all : final
240+ all : final verts
221241xall : final verts pack
222242
223243pack :
@@ -244,7 +264,7 @@ final:
244264# ## Make MTed corpora
245265
246266# Make distribution with:
247- FINALIZE-MT =perl ../Scripts/parlamint2distro.pl -version ${VERSION} -anahandle ${HANDLE-MT} -schema ${PARLAMINT}/Schema -docs ${HERE}/Sources-Distro
267+ FINALIZE-MT =perl ../Scripts/parlamint2distro.pl -version ${VERSION} -anahandle ${HANDLE-MT} -schema ${PARLAMINT}/Schema -docs ${HERE}/Sources-Distro -procMemGB ${JAVA-MEMORY} -procChunkSize ${CHUNK-SIZE} -procThreads ${THREADS}
248268
249269# Targets
250270mt-nohup1 :
@@ -260,8 +280,8 @@ mt-nohup5:
260280mt-nohup6 :
261281 nice nohup time make mt-all > Logs/ParlaMint-en.6.log &
262282
263- mt-all : mt-convert
264- mt-xall-final : mt-convert mt-verts mt-pack mt-web
283+ mt-all : mt-final
284+ mt-xall-final : mt-convert mt-final mt- verts mt-pack mt-web
265285
266286# Make MT .txt and CoNLL files
267287mt-convert-txt :
@@ -309,19 +329,30 @@ sanity:
309329 zcat Verts/ParlaMint-XX.${VERSION} .vert.gz | grep -c ' </s>'
310330 zcat Verts/ParlaMint-XX-en.${VERSION} .vert.gz | grep -c ' </s>'
311331
312- # Convert from English CoNLL-U + source .TEI.ana -> -en.TEI.ana
313- mt-convert :
332+ # Convert from English CoNLL-U + source .TEI.ana -> -en.TEI.ana
333+ # We did this once and probably won't do it again, except for new corpora!
334+ xx-mt-convert :
314335 for CORPUS in ${CORPORA} ; do \
315336 perl Scripts/parlamint-mt2tei.pl \
316337 ${HERE} /Distro/ParlaMint-$$ {CORPUS}.TEI.ana/ParlaMint-$$ {CORPUS}.ana.xml \
317338 ${SOURCES-MT} /ParlaMint-$$ {CORPUS}-en-notes.tsv \
318339 ${SOURCES-MT} /ParlaMint-$$ {CORPUS}-en.sem \
319- ${TEMP} /ParlaMint-$$ {CORPUS}-en.TEI.ana 2> Logs/ParlaMint-$$ {CORPUS}-en.log; \
320- ${FINALIZE-MT} -all -notei -codes $$ {CORPUS}-en -in ${TEMP} -out ${HERE} /Distro \
321- 2>> Logs/ParlaMint-$$ {CORPUS}-en.log; \
340+ ${SOURCES} /ParlaMint-$$ {CORPUS}-en.TEI.ana 2> Logs/ParlaMint-$$ {CORPUS}-en.prepare.log; \
341+ done ;
342+
343+ # Finalise -en.TEI.ana corpora
344+ mt-final :
345+ for CORPUS in ${CORPORA} ; do \
346+ ${FINALIZE-MT} -all -notei -codes $$ {CORPUS}-en -in ${SOURCES} -out ${HERE} /Distro 2> Logs/ParlaMint-$$ {CORPUS}-en.log; \
322347 grep -a -i ' fatal' Logs/ParlaMint-$$ {CORPUS}-en.log > Logs/ParlaMint-$$ {CORPUS}-en.error.log; \
323348 grep -a -i ' error' Logs/ParlaMint-$$ {CORPUS}-en.log >> Logs/ParlaMint-$$ {CORPUS}-en.error.log; \
324349 grep -a -i ' warn' Logs/ParlaMint-$$ {CORPUS}-en.log > Logs/ParlaMint-$$ {CORPUS}-en.warn.log; \
350+ echo " $$ {CORPUS}-en.warn" ; \
351+ cat Logs/ParlaMint-$$ {CORPUS}-en.warn.log | wc -l; \
352+ cat Logs/ParlaMint-$$ {CORPUS}-en.warn.log | sort | uniq | wc -l; \
353+ echo " $$ {CORPUS}-en.error" ; \
354+ cat Logs/ParlaMint-$$ {CORPUS}-en.error.log | wc -l; \
355+ cat Logs/ParlaMint-$$ {CORPUS}-en.error.log | sort | uniq | wc -l; \
325356 done ;
326357
327358# ## Make CoNLL-U only
@@ -355,7 +386,7 @@ mt-test8:
355386 $s meta=${HERE} /Distro/ParlaMint-UA.TEI.ana/ParlaMint-UA.ana.xml -xsl:../Scripts/validate-parlamint.xsl \
356387 ${HERE} /Distro/ParlaMint-UA.TEI.ana/2022/ParlaMint-UA_2022-01-25-m0.ana.xml
357388mt-test7 :
358- $s meta=${HERE} /Distro/ParlaMint-AT-en.TEI.ana/ParlaMint-AT-en.ana.xml -xsl:../Scripts// check-links.xsl \
389+ $s meta=${HERE} /Distro/ParlaMint-AT-en.TEI.ana/ParlaMint-AT-en.ana.xml -xsl:../Scripts/check-links.xsl \
359390 ${HERE} /Distro/ParlaMint-AT-en.TEI.ana/2022/ParlaMint-AT-en_2022-01-20-027-XXVII-NRSITZ-00139.ana.xml
360391mt-test6 :
361392 ${FINALIZE-MT} -sample -codes ES-CT-en -out ${HERE} /Distro
@@ -384,7 +415,7 @@ mt-test1:
384415
385416# ## Merging taxonomies (the common taxonomy files should now be edited directly; local copies are no longer merged)
386417
387- TAXONOMIES-TEI = subcorpus speaker_types parla.legislature
418+ TAXONOMIES-TEI = subcorpus speaker_types parla.legislature topic
388419TAXONOMIES-ANA = NER sentiment
389420merge-taxos-nohup :
390421 nohup time make merge-taxos 2> Taxonomies/ParlaMint-taxonomy-merge.log > Logs/ParlaMint-taxonomy.log &
@@ -418,6 +449,9 @@ body:
418449JAVA-MEMORY =240
419450JM := $(shell test -n "$(JAVA-MEMORY ) " && echo -n "-Xmx$(JAVA-MEMORY ) g")
420451
452+ CHUNK-SIZE =100
453+ THREADS =7
454+
421455P = parallel --citation --gnu --halt 2
422456# Run java with a large heap, as a complete corpus needs to be read in
423457s = java -jar $(JM ) ../Scripts/bin/saxon.jar
0 commit comments