-- Ordinal statistics per series of event_dblp (rows without ordinal ignored).
-- available:    difference between the highest and the lowest ordinal
-- completeness: that difference divided by (maxOrdinal - 1); NULLIF turns a
--               zero denominator (highest ordinal is 1) into NULL instead of
--               relying on engine-specific division-by-zero behaviour
SELECT
    series,
    MIN(ordinal) AS minOrdinal,
    MAX(ordinal) AS maxOrdinal,
    AVG(ordinal) AS avgOrdinal,
    MAX(ordinal) - MIN(ordinal) AS available,
    (MAX(ordinal) - MIN(ordinal)) / NULLIF(MAX(ordinal) - 1.0, 0) AS completeness
FROM event_dblp
WHERE ordinal IS NOT NULL
GROUP BY series
-- formerly "order by 6": name the column so reordering the select list cannot change the sort
ORDER BY completeness DESC
-- Ordinal statistics per series of event_dblp (rows without ordinal ignored).
-- available:    difference between the highest and the lowest ordinal
-- completeness: that difference divided by (maxOrdinal - 1); NULLIF turns a
--               zero denominator (highest ordinal is 1) into NULL instead of
--               relying on engine-specific division-by-zero behaviour
SELECT
    series,
    MIN(ordinal) AS minOrdinal,
    MAX(ordinal) AS maxOrdinal,
    AVG(ordinal) AS avgOrdinal,
    MAX(ordinal) - MIN(ordinal) AS available,
    (MAX(ordinal) - MIN(ordinal)) / NULLIF(MAX(ordinal) - 1.0, 0) AS completeness
FROM event_dblp
WHERE ordinal IS NOT NULL
GROUP BY series
-- formerly "order by 6": name the column so reordering the select list cannot change the sort
ORDER BY completeness DESC
see also http://conferencecorpus.bitplan.com/query/SignatureAvailability
[[File:completeSignature_country.png|600px]]
Too few available ordinals for analysis.
not enough data
-- Ordinal statistics per series of event_dblp (rows without ordinal ignored).
-- ordinalRange: difference between the highest and the lowest ordinal
-- completeness: ordinalRange divided by (maxOrdinal - 1); NULLIF turns a zero
--               denominator (highest ordinal is 1) into NULL instead of
--               relying on engine-specific division-by-zero behaviour
SELECT
    series,
    MIN(ordinal) AS minOrdinal,
    MAX(ordinal) AS maxOrdinal,
    AVG(ordinal) AS avgOrdinal,
    MAX(ordinal) - MIN(ordinal) AS ordinalRange,
    (MAX(ordinal) - MIN(ordinal)) / NULLIF(MAX(ordinal) - 1.0, 0) AS completeness
FROM event_dblp
WHERE ordinal IS NOT NULL
GROUP BY series
-- formerly "ORDER by 6": name the column so reordering the select list cannot change the sort
ORDER BY completeness DESC
-- Ordinal statistics per inEventSeries of event_orclone (rows without ordinal ignored).
-- ordinalRange: difference between the highest and the lowest ordinal
-- completeness: ordinalRange divided by (maxOrdinal - 1); NULLIF turns a zero
--               denominator (highest ordinal is 1) into NULL instead of
--               relying on engine-specific division-by-zero behaviour
SELECT
    inEventSeries,
    MIN(ordinal) AS minOrdinal,
    MAX(ordinal) AS maxOrdinal,
    AVG(ordinal) AS avgOrdinal,
    MAX(ordinal) - MIN(ordinal) AS ordinalRange,
    (MAX(ordinal) - MIN(ordinal)) / NULLIF(MAX(ordinal) - 1.0, 0) AS completeness
FROM event_orclone
WHERE ordinal IS NOT NULL
GROUP BY inEventSeries
-- formerly "ORDER by 6": name the column so reordering the select list cannot change the sort
ORDER BY completeness DESC
-- Ordinal statistics per seriesId of event_wikicfp (rows without ordinal ignored).
-- ordinalRange: difference between the highest and the lowest ordinal
-- completeness: ordinalRange divided by (maxOrdinal - 1); NULLIF turns a zero
--               denominator (highest ordinal is 1) into NULL instead of
--               relying on engine-specific division-by-zero behaviour
SELECT
    seriesId,
    MIN(ordinal) AS minOrdinal,
    MAX(ordinal) AS maxOrdinal,
    AVG(ordinal) AS avgOrdinal,
    MAX(ordinal) - MIN(ordinal) AS ordinalRange,
    (MAX(ordinal) - MIN(ordinal)) / NULLIF(MAX(ordinal) - 1.0, 0) AS completeness
FROM event_wikicfp
WHERE ordinal IS NOT NULL
GROUP BY seriesId
-- formerly "ORDER by 6": name the column so reordering the select list cannot change the sort
ORDER BY completeness DESC
-- Ordinal statistics per eventInSeriesId of event_wikidata (rows without ordinal ignored).
-- ordinalRange: difference between the highest and the lowest ordinal
-- completeness: ordinalRange divided by (maxOrdinal - 1); NULLIF turns a zero
--               denominator (highest ordinal is 1) into NULL instead of
--               relying on engine-specific division-by-zero behaviour
SELECT
    eventInSeriesId,
    MIN(ordinal) AS minOrdinal,
    MAX(ordinal) AS maxOrdinal,
    AVG(ordinal) AS avgOrdinal,
    MAX(ordinal) - MIN(ordinal) AS ordinalRange,
    (MAX(ordinal) - MIN(ordinal)) / NULLIF(MAX(ordinal) - 1.0, 0) AS completeness
FROM event_wikidata
WHERE ordinal IS NOT NULL
GROUP BY eventInSeriesId
-- formerly "ORDER by 6": name the column so reordering the select list cannot change the sort
ORDER BY completeness DESC
Not all data sources have an event series reference per event. As an alternative approach we analyzed the acronyms to group events into series. Here we also calculated the number of distinct ordinals.
The results will in some cases mix data from event series with ambiguous acronyms and therefore give a more positive impression again. So again this is an estimate for an upper bound of the event series completion.
def testSeriesCompletenessHistogrammByAcronym(self):
'''
acronym based histogramms
'''
def histogrammSettings(plot):
'''
optional callback to add more data to histogramm
'''
pass
debug = False
self.figureList=FigureList(caption="event Series completeness by acronym",figureListLabel="eventcompa",cols=3)
for dataSource in DataSource.sources.values():
if dataSource.name in ["acm","confref"]:
continue
print(dataSource)
histOutputFileName=f"eventSeriesCompletionByAcronymHistogramm_{dataSource.name}.png"
sqlQuery = """SELECT acronym, ordinal
FROM %s
""" % (dataSource.tableName)
sqlDB = EventStorage.getSqlDB()
lod = sqlDB.query(sqlQuery)
series = {}
acronymRegexp = r'(?P<acronym>[A-Z]+)\s*[0-9]+'
for d in lod:
acronym = d.get('acronym')
if acronym:
match = re.fullmatch(acronymRegexp, acronym)
if match is None:
continue
seriesAcronym = match.group("acronym")
if isinstance(seriesAcronym, str):
if seriesAcronym in series:
series[seriesAcronym].append(d)
else:
series[seriesAcronym] = [d]
aggLod = []
for series, eventRecords in series.items():
# set operation
ordinals: List[int] = [int(r.get("ordinal"))
for r in eventRecords
if r.get("ordinal")
and ((isinstance(r.get("ordinal"), str) and r.get("ordinal").isnumeric()) or isinstance(r.get("ordinal"), int))]
if len(ordinals) == 0:
continue
minOrd = min(ordinals)
maxOrd = max(ordinals)
numberOfDistinctOrds = len(set(ordinals))
# count set content
res = {
"series": series,
"minOrdinal": minOrd,
"maxOrdinal": maxOrd,
"avgOrdinal": mean(ordinals),
"span": maxOrd-minOrd,
"available": numberOfDistinctOrds,
"completeness": numberOfDistinctOrds / maxOrd if maxOrd>1 else 1.0
}
aggLod.append(res)
figure=Figure(dataSource.title,caption=f"event series completeness of {dataSource.name}",figLabel=f"esca-{dataSource.name}",sqlQuery=None,fileNames=[histOutputFileName])
self.figureList.add(figure)
values = [round(record["completeness"], 2) for record in aggLod if isinstance(record["completeness"], float)]
values.sort()
threshold =values[len(values)//2]
h = Histogramm(x=values)
hps = PlotSettings(outputFile=f"{self.histroot}/{histOutputFileName}", callback=histogrammSettings)
h.show(xLabel='completeness',
yLabel='distribution',
title=f'{figure.title}',
alpha=self.alpha,
density=True,
ps=hps,
bins=10,
vlineAt=threshold)
print(dataSource, len(values), "→", len(values) // 2)
self.figureList.printAllMarkups()
def testSeriesCompletenessHistogrammByAcronym(self):
'''
acronym based histogramms
'''
def histogrammSettings(plot):
'''
optional callback to add more data to histogramm
'''
pass
debug = False
self.figureList=FigureList(caption="event Series completeness by acronym",figureListLabel="eventcompa",cols=3)
for dataSource in DataSource.sources.values():
if dataSource.name in ["acm","confref"]:
continue
print(dataSource)
histOutputFileName=f"eventSeriesCompletionByAcronymHistogramm_{dataSource.name}.png"
sqlQuery = """SELECT acronym, ordinal
FROM %s
""" % (dataSource.tableName)
sqlDB = EventStorage.getSqlDB()
lod = sqlDB.query(sqlQuery)
series = {}
acronymRegexp = r'(?P<acronym>[A-Z]+)\s*[0-9]+'
for d in lod:
acronym = d.get('acronym')
if acronym:
match = re.fullmatch(acronymRegexp, acronym)
if match is None:
continue
seriesAcronym = match.group("acronym")
if isinstance(seriesAcronym, str):
if seriesAcronym in series:
series[seriesAcronym].append(d)
else:
series[seriesAcronym] = [d]
aggLod = []
for series, eventRecords in series.items():
# set operation
ordinals: List[int] = [int(r.get("ordinal"))
for r in eventRecords
if r.get("ordinal")
and ((isinstance(r.get("ordinal"), str) and r.get("ordinal").isnumeric()) or isinstance(r.get("ordinal"), int))]
if len(ordinals) == 0:
continue
minOrd = min(ordinals)
maxOrd = max(ordinals)
numberOfDistinctOrds = len(set(ordinals))
# count set content
res = {
"series": series,
"minOrdinal": minOrd,
"maxOrdinal": maxOrd,
"avgOrdinal": mean(ordinals),
"span": maxOrd-minOrd,
"available": numberOfDistinctOrds,
"completeness": numberOfDistinctOrds / maxOrd if maxOrd>1 else 1.0
}
aggLod.append(res)
figure=Figure(dataSource.title,caption=f"event series completeness of {dataSource.name}",figLabel=f"esca-{dataSource.name}",sqlQuery=None,fileNames=[histOutputFileName])
self.figureList.add(figure)
values = [round(record["completeness"], 2) for record in aggLod if isinstance(record["completeness"], float)]
values.sort()
threshold =values[len(values)//2]
h = Histogramm(x=values)
hps = PlotSettings(outputFile=f"{self.histroot}/{histOutputFileName}", callback=histogrammSettings)
h.show(xLabel='completeness',
yLabel='distribution',
title=f'{figure.title}',
alpha=self.alpha,
density=True,
ps=hps,
bins=10,
vlineAt=threshold)
print(dataSource, len(values), "→", len(values) // 2)
self.figureList.printAllMarkups()