Skip to content

Commit df0b449

Browse files
Fix #804: Support multiple template namespace prefixes for Macedonian using wsl
1 parent b88ab6b commit df0b449

File tree

1 file changed

+95
-86
lines changed

1 file changed

+95
-86
lines changed

server/src/main/scala/org/dbpedia/extraction/server/stats/MappingStatsHolder.scala

Lines changed: 95 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -5,121 +5,130 @@ import scala.collection.mutable
55
import org.dbpedia.extraction.mappings._
66
import org.dbpedia.extraction.util.StringUtils.prettyMillis
77
import org.dbpedia.extraction.wikiparser.{Namespace,TemplateNode}
8+
import org.dbpedia.extraction.wikiparser.impl.wikipedia.Namespaces
89
import MappingStats.InvalidTarget
910

1011
object MappingStatsHolder {

  private val logger = Logger.getLogger(getClass.getName)

  /**
   * Builds the mapped statistics of one language from its raw Wikipedia statistics
   * and the currently loaded template mappings.
   *
   * A template title is accepted when it starts with any name registered for the
   * template namespace (code 10) of the language, or with the default template
   * namespace name. Titles that match none of these prefixes are logged as
   * warnings and left out of the result.
   *
   * @param wikiStats  raw template, property and redirect statistics of one language
   * @param mappings   mappings whose template mappings decide what counts as mapped
   * @param ignoreList passed on unchanged to every MappingStats and to the holder
   * @return holder with one MappingStats per accepted template and the reversed
   *         redirects of mapped templates
   */
  def apply(wikiStats: WikipediaStats, mappings: Mappings, ignoreList: IgnoreList): MappingStatsHolder = {

    val language = wikiStats.language

    val startedAt = System.currentTimeMillis
    logger.info("Updating " + language.wikiCode + " mapped statistics")

    val templateMappings = mappings.templateMappings

    // Default template namespace name for the language
    val defaultPrefix = Namespace.Template.name(language) + ":"

    // Every name registered for the template namespace (code 10), plus the default.
    // Handles languages like Macedonian that expose multiple valid prefixes.
    val templatePrefixes = Namespaces.names(language)
      .collect { case (name, code) if code == 10 => name + ":" }
      .toSet + defaultPrefix

    // Strips whichever valid template prefix the title starts with, if there is one.
    def withoutPrefix(title: String): Option[String] =
      templatePrefixes
        .find(prefix => title.startsWith(prefix))
        .map(prefix => title.substring(prefix.length))

    val statistics = new mutable.ArrayBuffer[MappingStats]()

    wikiStats.templates.foreach { case (rawTemplate, templateStats) =>
      withoutPrefix(rawTemplate) match {
        case Some(templateName) =>
          val isMapped = templateMappings.contains(templateName)
          val mappedProps =
            if (isMapped) new PropertyCollector(templateMappings(templateName)).properties
            else Set.empty[String]

          val properties = new mutable.HashMap[String, (Int, Boolean)]

          // properties that occur in the wiki: usage count and whether they are mapped
          templateStats.properties.foreach { case (name, count) =>
            properties(name) = (count, mappedProps.contains(name))
          }

          // mapped properties that never occur in the wiki are marked as invalid targets
          mappedProps.foreach { name =>
            properties.getOrElseUpdate(name, (InvalidTarget, true))
          }

          statistics += new MappingStats(templateStats, templateName, isMapped, properties.toMap, ignoreList)

        case None =>
          logger.warning(language.wikiCode + " template '" + rawTemplate + "' does not start with any valid template namespace prefix")
      }
    }

    // keep only redirects whose source is a mapped template, then reverse their direction
    val redirects = wikiStats.redirects.collect {
      case (title, target) if withoutPrefix(title).exists(name => templateMappings.contains(name)) =>
        (target, title)
    }

    val holder = new MappingStatsHolder(mappings, statistics.toList, redirects, ignoreList)

    logger.info("Updated " + language.wikiCode + " mapped statistics in " + prettyMillis(System.currentTimeMillis - startedAt))

    holder
  }

}
6475

6576
/**
 * Contains statistics data computed from Wikipedia statistics numbers and template mappings.
 * Also holds on to the mappings to make synchronization in MappingStatsManager easier.
 * TODO: better solution for mappings?
 *
 * All counts skip templates that the ignore list marks as ignored. The ratios are plain
 * floating point divisions without a guard for a zero denominator, so they can be NaN
 * or infinite when there is nothing to count.
 */
class MappingStatsHolder(val mappings: Mappings, val mappedStatistics: List[MappingStats], val reversedRedirects: Map[String, String], ignoreList: IgnoreList) {

  /**
   * Sums `count` over the templates that are not ignored.
   *
   * @param all   if true every template is counted, otherwise only mapped ones
   * @param count value contributed by a single template
   */
  private def countTemplates(all: Boolean, count: MappingStats => Int): Int =
    mappedStatistics.iterator
      .filter(stats => all || stats.isMapped)
      .filterNot(stats => ignoreList.isTemplateIgnored(stats.templateName))
      .map(count)
      .sum

  private def countAllTemplates(count: MappingStats => Int): Int = countTemplates(true, count)
  private def countMappedTemplates(count: MappingStats => Int): Int = countTemplates(false, count)

  // share of the mapped part in the whole; deliberately unguarded, see class comment
  private def ratio(mapped: Int, total: Int): Double = mapped.toDouble / total.toDouble

  val templateCount = countAllTemplates(_ => 1)
  val mappedTemplateCount = countMappedTemplates(_ => 1)

  val templateUseCount = countAllTemplates(_.templateCount)
  val mappedTemplateUseCount = countMappedTemplates(_.templateCount)

  val propertyCount = countAllTemplates(_.propertyCount)
  val mappedPropertyCount = countMappedTemplates(_.mappedPropertyCount)

  val propertyUseCount = countAllTemplates(_.propertyUseCount)
  val mappedPropertyUseCount = countMappedTemplates(_.mappedPropertyUseCount)

  val mappedTemplateRatio = ratio(mappedTemplateCount, templateCount)
  val mappedPropertyRatio = ratio(mappedPropertyCount, propertyCount)

  val mappedTemplateUseRatio = ratio(mappedTemplateUseCount, templateUseCount)
  val mappedPropertyUseRatio = ratio(mappedPropertyUseCount, propertyUseCount)
}
105115

106116
class PropertyCollector(mapping: Extractor[TemplateNode]) {
107-
108117
val properties = new mutable.HashSet[String]
109-
118+
110119
classMapping(mapping) // go get'em!
111-
112-
/**
 * Collects the template properties used by a class-level mapping.
 * A template mapping contributes its property mappings; a conditional mapping
 * contributes the mappings of all its cases followed by its default mappings.
 * Other extractor types are not covered by this match.
 */
private def classMapping(mapping: Extractor[TemplateNode]): Unit = mapping match {
  case template: TemplateMapping =>
    for (property <- template.mappings) propertyMapping(property)
  case conditional: ConditionalMapping =>
    for (condition <- conditional.cases) conditionMapping(condition)
    for (property <- conditional.defaultMappings) propertyMapping(property)
}
118-
119-
/** Collects the properties of the mapping wrapped by a single condition case. */
private def conditionMapping(mapping: ConditionMapping): Unit = {
  val wrapped = mapping.mapping
  classMapping(wrapped)
}
121-
122-
private def propertyMapping(mapping: PropertyMapping) : Unit = mapping match {
130+
131+
private def propertyMapping(mapping: PropertyMapping): Unit = mapping match {
123132
case m: SimplePropertyMapping => this + m.templateProperty
124133
case m: GeoCoordinatesMapping => this + m.coordinates + m.latitude + m.longitude + m.longitudeDegrees + m.longitudeMinutes + m.longitudeSeconds + m.longitudeDirection + m.latitudeDegrees + m.latitudeMinutes + m.latitudeSeconds + m.latitudeDirection
125134
case m: CalculateMapping => this + m.templateProperty1 + m.templateProperty2
@@ -128,8 +137,8 @@ class PropertyCollector(mapping: Extractor[TemplateNode]) {
128137
case m: IntermediateNodeMapping => m.mappings.foreach(propertyMapping)
129138
case m: ConstantMapping => // ignore
130139
}
131-
132-
/**
 * Records one template property name and returns this collector so calls can be chained.
 * A null name is silently skipped.
 */
private def +(name: String): PropertyCollector = {
  Option(name).foreach(properties += _)
  this
}

0 commit comments

Comments
 (0)