@@ -5,121 +5,130 @@ import scala.collection.mutable
55import org .dbpedia .extraction .mappings ._
66import org .dbpedia .extraction .util .StringUtils .prettyMillis
77import org .dbpedia .extraction .wikiparser .{Namespace ,TemplateNode }
8+ import org .dbpedia .extraction .wikiparser .impl .wikipedia .Namespaces
89import MappingStats .InvalidTarget
910
1011object MappingStatsHolder {
11-
1212 private val logger = Logger .getLogger(getClass.getName)
1313
1414 def apply (wikiStats : WikipediaStats , mappings : Mappings , ignoreList : IgnoreList ): MappingStatsHolder = {
15-
16- val language = wikiStats.language
17-
18- val millis = System .currentTimeMillis
19- logger.info(" Updating " + language.wikiCode+ " mapped statistics" )
20-
21- val templateMappings = mappings.templateMappings
22-
23- var statistics = new mutable.ArrayBuffer [MappingStats ]()
24-
25- val templateNamespace = Namespace .Template .name(language) + " :"
26-
27- for ((rawTemplate, templateStats) <- wikiStats.templates)
28- {
29- if (rawTemplate startsWith templateNamespace) {
30-
31- val templateName = rawTemplate.substring(templateNamespace.length)
32- val isMapped = templateMappings.contains(templateName)
33- val mappedProps =
34- if (isMapped) new PropertyCollector (templateMappings(templateName)).properties
35- else Set .empty[String ]
36-
37- var properties = new mutable.HashMap [String , (Int , Boolean )]
38-
39- for ((name, count) <- templateStats.properties) {
40- properties(name) = (count, mappedProps.contains(name))
41- }
42-
43- for (name <- mappedProps) {
44- if (! properties.contains(name)) properties(name) = (InvalidTarget , true )
45- }
46-
47- statistics += new MappingStats (templateStats, templateName, isMapped, properties.toMap, ignoreList)
48-
49- } else {
50- logger.warning(language.wikiCode+ " template '" + rawTemplate+ " ' does not start with '" + templateNamespace+ " '" )
15+ val language = wikiStats.language
16+
17+ val millis = System .currentTimeMillis
18+ logger.info(" Updating " + language.wikiCode + " mapped statistics" )
19+
20+ val templateMappings = mappings.templateMappings
21+
22+ var statistics = new mutable.ArrayBuffer [MappingStats ]()
23+
24+ // Default template namespace name for the language
25+ val templateNamespace = Namespace .Template .name(language) + " :"
26+
27+ // Build all valid namespace prefixes for Template namespace (code 10)
28+ // Handles languages like Macedonian that expose multiple valid prefixes
29+ val validTemplatePrefixes = Namespaces .names(language)
30+ .filter(_._2 == 10 ) // Template namespace code is 10
31+ .keys
32+ .map(_ + " :" )
33+ .toSet + templateNamespace
34+
35+ for ((rawTemplate, templateStats) <- wikiStats.templates) {
36+ val matchedPrefix = validTemplatePrefixes.find(rawTemplate.startsWith)
37+
38+ if (matchedPrefix.isDefined) {
39+ val templateName = rawTemplate.substring(matchedPrefix.get.length)
40+ val isMapped = templateMappings.contains(templateName)
41+ val mappedProps =
42+ if (isMapped) new PropertyCollector (templateMappings(templateName)).properties
43+ else Set .empty[String ]
44+
45+ var properties = new mutable.HashMap [String , (Int , Boolean )]
46+
47+ for ((name, count) <- templateStats.properties) {
48+ properties(name) = (count, mappedProps.contains(name))
49+ }
50+
51+ for (name <- mappedProps) {
52+ if (! properties.contains(name)) properties(name) = (InvalidTarget , true )
5153 }
54+
55+ statistics += new MappingStats (templateStats, templateName, isMapped, properties.toMap, ignoreList)
56+ } else {
57+ logger.warning(language.wikiCode + " template '" + rawTemplate + " ' does not start with any valid template namespace prefix" )
5258 }
53-
54- val redirects = wikiStats.redirects.filterKeys(title => templateMappings.contains(title.substring(templateNamespace.length))).map(_.swap)
55-
56- val holder = new MappingStatsHolder (mappings, statistics.toList, redirects, ignoreList)
57-
58- logger.info(" Updated " + language.wikiCode+ " mapped statistics in " + prettyMillis(System .currentTimeMillis - millis))
59-
60- holder
59+ }
60+
61+ val redirects = wikiStats.redirects
62+ .filterKeys { title =>
63+ val matchedPrefix = validTemplatePrefixes.find(title.startsWith)
64+ matchedPrefix.isDefined && templateMappings.contains(title.substring(matchedPrefix.get.length))
65+ }
66+ .map(_.swap)
67+
68+ val holder = new MappingStatsHolder (mappings, statistics.toList, redirects, ignoreList)
69+
70+ logger.info(" Updated " + language.wikiCode + " mapped statistics in " + prettyMillis(System .currentTimeMillis - millis))
71+
72+ holder
6173 }
62-
6374}
6475
6576/**
6677 * Contains statistics data computed from Wikipedia statistics numbers and template mappings.
67- * Also holds on to the mappings to make synchronization in MappingStatsManager easier.
78+ * Also holds on to the mappings to make synchronization in MappingStatsManager easier.
6879 * TODO: better solution for mappings?
6980 */
7081class MappingStatsHolder (val mappings : Mappings , val mappedStatistics : List [MappingStats ], val reversedRedirects : Map [String , String ], ignoreList : IgnoreList ) {
71-
72- private def countTemplates (all : Boolean , count : MappingStats => Int ): Int = {
73- var sum = 0
74- for (ms <- mappedStatistics) {
75- if (all || ms.isMapped) {
76- if (! ignoreList.isTemplateIgnored(ms.templateName)) {
77- sum += count(ms)
78- }
82+ private def countTemplates (all : Boolean , count : MappingStats => Int ): Int = {
83+ var sum = 0
84+ for (ms <- mappedStatistics) {
85+ if (all || ms.isMapped) {
86+ if (! ignoreList.isTemplateIgnored(ms.templateName)) {
87+ sum += count(ms)
7988 }
8089 }
81- sum
8290 }
91+ sum
92+ }
93+
94+ private def countAllTemplates (count : MappingStats => Int ): Int = countTemplates(true , count)
95+ private def countMappedTemplates (count : MappingStats => Int ): Int = countTemplates(false , count)
96+
97+ val templateCount = countAllTemplates(_ => 1 )
98+ val mappedTemplateCount = countMappedTemplates(_ => 1 )
99+
100+ val templateUseCount = countAllTemplates(_.templateCount)
101+ val mappedTemplateUseCount = countMappedTemplates(_.templateCount)
102+
103+ val propertyCount = countAllTemplates(_.propertyCount)
104+ val mappedPropertyCount = countMappedTemplates(_.mappedPropertyCount)
83105
84- private def countAllTemplates (count : MappingStats => Int ): Int = countTemplates(true , count)
85- private def countMappedTemplates (count : MappingStats => Int ): Int = countTemplates(false , count)
86-
87- val templateCount = countAllTemplates(_ => 1 )
88- val mappedTemplateCount = countMappedTemplates(_ => 1 )
89-
90- val templateUseCount = countAllTemplates(_.templateCount)
91- val mappedTemplateUseCount = countMappedTemplates(_.templateCount)
92-
93- val propertyCount = countAllTemplates(_.propertyCount)
94- val mappedPropertyCount = countMappedTemplates(_.mappedPropertyCount)
95-
96- val propertyUseCount = countAllTemplates(_.propertyUseCount)
97- val mappedPropertyUseCount = countMappedTemplates(_.mappedPropertyUseCount)
98-
99- val mappedTemplateRatio = mappedTemplateCount.toDouble / templateCount.toDouble
100- val mappedPropertyRatio = mappedPropertyCount.toDouble / propertyCount.toDouble
101-
102- val mappedTemplateUseRatio = mappedTemplateUseCount.toDouble / templateUseCount.toDouble
103- val mappedPropertyUseRatio = mappedPropertyUseCount.toDouble / propertyUseCount.toDouble
106+ val propertyUseCount = countAllTemplates(_.propertyUseCount)
107+ val mappedPropertyUseCount = countMappedTemplates(_.mappedPropertyUseCount)
108+
109+ val mappedTemplateRatio = mappedTemplateCount.toDouble / templateCount.toDouble
110+ val mappedPropertyRatio = mappedPropertyCount.toDouble / propertyCount.toDouble
111+
112+ val mappedTemplateUseRatio = mappedTemplateUseCount.toDouble / templateUseCount.toDouble
113+ val mappedPropertyUseRatio = mappedPropertyUseCount.toDouble / propertyUseCount.toDouble
104114}
105115
106116class PropertyCollector (mapping : Extractor [TemplateNode ]) {
107-
108117 val properties = new mutable.HashSet [String ]
109-
118+
110119 classMapping(mapping) // go get'em!
111-
112- private def classMapping (mapping : Extractor [TemplateNode ]) : Unit = mapping match {
120+
121+ private def classMapping (mapping : Extractor [TemplateNode ]): Unit = mapping match {
113122 case tm : TemplateMapping => tm.mappings.foreach(propertyMapping)
114123 case cm : ConditionalMapping =>
115124 cm.cases.foreach(conditionMapping)
116125 cm.defaultMappings.foreach(propertyMapping)
117126 }
118-
119- private def conditionMapping (mapping : ConditionMapping ) : Unit =
127+
128+ private def conditionMapping (mapping : ConditionMapping ): Unit =
120129 classMapping(mapping.mapping)
121-
122- private def propertyMapping (mapping : PropertyMapping ) : Unit = mapping match {
130+
131+ private def propertyMapping (mapping : PropertyMapping ): Unit = mapping match {
123132 case m : SimplePropertyMapping => this + m.templateProperty
124133 case m : GeoCoordinatesMapping => this + m.coordinates + m.latitude + m.longitude + m.longitudeDegrees + m.longitudeMinutes + m.longitudeSeconds + m.longitudeDirection + m.latitudeDegrees + m.latitudeMinutes + m.latitudeSeconds + m.latitudeDirection
125134 case m : CalculateMapping => this + m.templateProperty1 + m.templateProperty2
@@ -128,8 +137,8 @@ class PropertyCollector(mapping: Extractor[TemplateNode]) {
128137 case m : IntermediateNodeMapping => m.mappings.foreach(propertyMapping)
129138 case m : ConstantMapping => // ignore
130139 }
131-
132- private def + (name : String ) : PropertyCollector = {
140+
141+ private def + (name : String ): PropertyCollector = {
133142 if (name != null ) properties.add(name)
134143 this
135144 }
0 commit comments