Difference between revisions of "ConferenceCorpus/statistics"

From BITPlan Wiki
Jump to navigation Jump to search
 
(17 intermediate revisions by the same user not shown)
Line 1: Line 1:
= Ordinal histogramms =
+
= Introduction =
== ceurws==
+
== SQL Query for Event Series completion ==
[[File:ordinalhistogramm_event_ceurws.png|600px]]       
 
           
 
== orbackup==
 
[[File:ordinalhistogramm_event_orbackup.png|600px]]       
 
           
 
== orclonebackup==
 
[[File:ordinalhistogramm_event_orclonebackup.png|600px]]       
 
           
 
== confref==
 
[[File:ordinalhistogramm_event_confref.png|600px]]       
 
           
 
== gnd==
 
[[File:ordinalhistogramm_event_gnd.png|600px]]       
 
           
 
== wikicfp==
 
[[File:ordinalhistogramm_event_wikicfp.png|600px]]       
 
           
 
== or==
 
[[File:ordinalhistogramm_event_or.png|600px]]       
 
           
 
== orclone==
 
[[File:ordinalhistogramm_event_orclone.png|600px]]       
 
           
 
== tibkat==
 
[[File:ordinalhistogramm_event_tibkat.png|600px]]       
 
           
 
== dblp==
 
[[File:ordinalhistogramm_event_dblp.png|600px]]       
 
           
 
== crossref==
 
[[File:ordinalhistogramm_event_crossref.png|600px]]       
 
           
 
== wikidata==
 
[[File:ordinalhistogramm_event_wikidata.png|600px]]
 
= Eventseries completeness =
 
== dblp ==
 
=== sql query ===
 
 
<source lang='sql'>
 
<source lang='sql'>
 
SELECT  
 
SELECT  
Line 50: Line 13:
 
group by series
 
group by series
 
order by 6 desc
 
order by 6 desc
               
 
 
</source>
 
</source>
=== histogramm ===
+
= Event Signature Availability =
 +
see also http://conferencecorpus.bitplan.com/query/SignatureAvailability
 +
=== signature completeness combined ===
 +
[[File:completeSignature_complete.png|600px]]
 +
=== signature completeness of acronym ===
 +
[[File:completeSignature_acronym.png|600px]]
 +
=== signature completeness of startDate ===
 +
[[File:completeSignature_startDate.png|600px]]
 +
=== signature completeness of ordinal ===
 +
[[File:completeSignature_ordinal.png|600px]]
 +
=== signature completeness of year ===
 +
[[File:completeSignature_year.png|600px]]
 +
=== signature completeness of title ===
 +
[[File:completeSignature_title.png|600px]]
 +
=== signature completeness of city ===
 +
[[File:completeSignature_city.png|600px]]
 +
=== signature completeness of country ===
 +
[[File:completeSignature_country.png|600px]]
 +
 
 +
= Ordinal histogramms =
 +
The Zipf diagrams show the log frequency (leaving out the first event, since typically the decline begins from the second event)
 +
 
 +
== confref==
 +
Too few available ordinals for analysis.
 +
           
 +
== CEUR-WS ==
 +
=== sql query ===
 +
<source lang='sql'>
 +
SELECT ordinal
 +
    FROM event_ceurws
 +
    where ordinal is not null
 +
    and ordinal < 50
 +
   
 +
</source>
 +
=== ceurws ordinals ===
 +
[[File:ordinalhistogramm_event_ceurws.png|600px]][[File:zipf_event_ceurws.png|600px]]
 +
== confref ==
 +
not enough data
 +
 
 +
== Crossref ==
 +
=== sql query ===
 +
<source lang='sql'>
 +
SELECT ordinal
 +
    FROM event_crossref
 +
    where ordinal is not null
 +
    and ordinal < 50
 +
   
 +
</source>
 +
=== crossref ordinals ===
 +
[[File:ordinalhistogramm_event_crossref.png|600px]][[File:zipf_event_crossref.png|600px]]
 +
== dblp ==
 +
=== sql query ===
 +
<source lang='sql'>
 +
SELECT ordinal
 +
    FROM event_dblp
 +
    where ordinal is not null
 +
    and ordinal < 50
 +
   
 +
</source>
 +
=== dblp ordinals ===
 +
[[File:ordinalhistogramm_event_dblp.png|600px]][[File:zipf_event_dblp.png|600px]]
 +
== GND ==
 +
=== sql query ===
 +
<source lang='sql'>
 +
SELECT ordinal
 +
    FROM event_gnd
 +
    where ordinal is not null
 +
    and ordinal < 50
 +
   
 +
</source>
 +
=== gnd ordinals ===
 +
[[File:ordinalhistogramm_event_gnd.png|600px]][[File:zipf_event_gnd.png|600px]]
 +
== OpenResearch ==
 +
=== sql query ===
 +
<source lang='sql'>
 +
SELECT ordinal
 +
    FROM event_or
 +
    where ordinal is not null
 +
    and ordinal < 50
 +
   
 +
</source>
 +
=== or ordinals ===
 +
[[File:ordinalhistogramm_event_or.png|600px]][[File:zipf_event_or.png|600px]]
 +
== OpenResearch ==
 +
=== sql query ===
 +
<source lang='sql'>
 +
SELECT ordinal
 +
    FROM event_orbackup
 +
    where ordinal is not null
 +
    and ordinal < 50
 +
   
 +
</source>
 +
=== orbackup ordinals ===
 +
[[File:ordinalhistogramm_event_orbackup.png|600px]][[File:zipf_event_orbackup.png|600px]]
 +
== OpenResearch ==
 +
=== sql query ===
 +
<source lang='sql'>
 +
SELECT ordinal
 +
    FROM event_orclone
 +
    where ordinal is not null
 +
    and ordinal < 50
 +
   
 +
</source>
 +
=== orclone ordinals ===
 +
[[File:ordinalhistogramm_event_orclone.png|600px]][[File:zipf_event_orclone.png|600px]]
 +
== OpenResearch ==
 +
=== sql query ===
 +
<source lang='sql'>
 +
SELECT ordinal
 +
    FROM event_orclonebackup
 +
    where ordinal is not null
 +
    and ordinal < 50
 +
   
 +
</source>
 +
=== orclonebackup ordinals ===
 +
[[File:ordinalhistogramm_event_orclonebackup.png|600px]][[File:zipf_event_orclonebackup.png|600px]]
 +
== TIBKAT ==
 +
=== sql query ===
 +
<source lang='sql'>
 +
SELECT ordinal
 +
    FROM event_tibkat
 +
    where ordinal is not null
 +
    and ordinal < 50
 +
   
 +
</source>
 +
=== tibkat ordinals ===
 +
[[File:ordinalhistogramm_event_tibkat.png|600px]][[File:zipf_event_tibkat.png|600px]]
 +
== WikiCFP ==
 +
=== sql query ===
 +
<source lang='sql'>
 +
SELECT ordinal
 +
    FROM event_wikicfp
 +
    where ordinal is not null
 +
    and ordinal < 50
 +
   
 +
</source>
 +
=== wikicfp ordinals ===
 +
[[File:ordinalhistogramm_event_wikicfp.png|600px]][[File:zipf_event_wikicfp.png|600px]]
 +
== Wikidata ==
 +
=== sql query ===
 +
<source lang='sql'>
 +
SELECT ordinal
 +
    FROM event_wikidata
 +
    where ordinal is not null
 +
    and ordinal < 50
 +
   
 +
</source>
 +
=== wikidata ordinals ===
 +
[[File:ordinalhistogramm_event_wikidata.png|600px]][[File:zipf_event_wikidata.png|600px]]
 +
 
 +
= Eventseries completeness =
 +
The queries here only give a rough estimate for an upper bound, since they assume that all events between min(ordinal) and max(ordinal) are available, which is often not the case.
 +
== dblp ==
 +
=== sql query ===
 +
<source lang='sql'>
 +
SELECT
 +
      series,
 +
      min(ordinal) as minOrdinal,
 +
      max(ordinal) as maxOrdinal,
 +
      avg(ordinal) as avgOrdinal,
 +
      max(Ordinal)-min(Ordinal) as ordinalRange,
 +
      (max(Ordinal)-min(Ordinal)) /(max(Ordinal)-1.0) as completeness
 +
FROM event_dblp
 +
WHERE ordinal is not null
 +
GROUP BY series
 +
ORDER by 6 DESC
 +
                   
 +
</source>
 +
=== event series completeness of dblp ===
 
[[File:dblp_series_completeness.png|600px]]
 
[[File:dblp_series_completeness.png|600px]]
 
+
== OpenResearch ==
== openresearch ==
 
 
=== sql query ===
 
=== sql query ===
 
<source lang='sql'>
 
<source lang='sql'>
 
SELECT  
 
SELECT  
  inEventSeries,
+
      inEventSeries,
  min(ordinal) as minOrdinal,  
+
      min(ordinal) as minOrdinal,  
  max(ordinal) as maxOrdinal,
+
      max(ordinal) as maxOrdinal,
  avg(ordinal) as avgOrdinal,
+
      avg(ordinal) as avgOrdinal,
  max(Ordinal)-min(Ordinal) as available,
+
      max(Ordinal)-min(Ordinal) as ordinalRange,
  (max(Ordinal)-min(Ordinal)) /(max(Ordinal)-1.0) as completeness
+
      (max(Ordinal)-min(Ordinal)) /(max(Ordinal)-1.0) as completeness
 
FROM event_orclone
 
FROM event_orclone
Where ordinal is not null  
+
WHERE ordinal is not null  
group by inEventSeries
+
GROUP BY inEventSeries
order by 6 desc
+
ORDER by 6 DESC
               
+
                   
 
</source>
 
</source>
=== histogramm ===
+
=== event series completeness of orclone ===
 
[[File:orclone_series_completeness.png|600px]]
 
[[File:orclone_series_completeness.png|600px]]
 
+
== WikiCFP ==
== wikicfp ==
 
 
=== sql query ===
 
=== sql query ===
 
<source lang='sql'>
 
<source lang='sql'>
 
SELECT  
 
SELECT  
  seriesId,
+
      seriesId,
  min(ordinal) as minOrdinal,  
+
      min(ordinal) as minOrdinal,  
  max(ordinal) as maxOrdinal,
+
      max(ordinal) as maxOrdinal,
  avg(ordinal) as avgOrdinal,
+
      avg(ordinal) as avgOrdinal,
  max(Ordinal)-min(Ordinal) as available,
+
      max(Ordinal)-min(Ordinal) as ordinalRange,
  (max(Ordinal)-min(Ordinal)) /(max(Ordinal)-1.0) as completeness
+
      (max(Ordinal)-min(Ordinal)) /(max(Ordinal)-1.0) as completeness
 
FROM event_wikicfp
 
FROM event_wikicfp
Where ordinal is not null  
+
WHERE ordinal is not null  
group by seriesId
+
GROUP BY seriesId
order by 6 desc
+
ORDER by 6 DESC
               
+
                   
 
</source>
 
</source>
=== histogramm ===
+
=== event series completeness of wikicfp ===
 
[[File:wikicfp_series_completeness.png|600px]]
 
[[File:wikicfp_series_completeness.png|600px]]
== wikidata ==
+
== Wikidata ==
 
=== sql query ===
 
=== sql query ===
 
<source lang='sql'>
 
<source lang='sql'>
 
SELECT  
 
SELECT  
  eventInSeriesId,
+
      eventInSeriesId,
  min(ordinal) as minOrdinal,  
+
      min(ordinal) as minOrdinal,  
  max(ordinal) as maxOrdinal,
+
      max(ordinal) as maxOrdinal,
  avg(ordinal) as avgOrdinal,
+
      avg(ordinal) as avgOrdinal,
  max(Ordinal)-min(Ordinal) as available,
+
      max(Ordinal)-min(Ordinal) as ordinalRange,
  (max(Ordinal)-min(Ordinal)) /(max(Ordinal)-1.0) as completeness
+
      (max(Ordinal)-min(Ordinal)) /(max(Ordinal)-1.0) as completeness
 
FROM event_wikidata
 
FROM event_wikidata
Where ordinal is not null  
+
WHERE ordinal is not null  
group by eventInSeriesId
+
GROUP BY eventInSeriesId
order by 6 desc
+
ORDER by 6 DESC
               
+
                   
 
</source>
 
</source>
=== histogramm ===
+
=== event series completeness of wikidata ===
 
[[File:wikidata_series_completeness.png|600px]]
 
[[File:wikidata_series_completeness.png|600px]]
 +
 +
= Eventseries completeness by acronym =
 +
Not all data sources have an eventseries reference per event. As an alternative approach we analyzed the acronyms to group event series. Here we also
 +
calculated the number of distinct ordinals.
 +
 +
The results will in some cases mix data from event series with ambiguous acronyms and therefore again give an overly positive impression. So this, too, is an estimate for an upper bound of the event series completeness.
 +
== Python Code for Eventseries completeness analysis ==
 +
<source lang='python'>
 +
  def testSeriesCompletenessHistogrammByAcronym(self):
 +
        '''
 +
        acronym based histogramms
 +
        '''
 +
        def histogrammSettings(plot):
 +
            '''
 +
            optional callback to add more data to histogramm
 +
            '''
 +
            pass
 +
       
 +
        debug = False
 +
        self.figureList=FigureList(caption="event Series completeness by acronym",figureListLabel="eventcompa",cols=3)
 +
        for dataSource in DataSource.sources.values():
 +
            if dataSource.name in ["acm","confref"]:
 +
                continue
 +
            print(dataSource)
 +
            histOutputFileName=f"eventSeriesCompletionByAcronymHistogramm_{dataSource.name}.png"
 +
            sqlQuery = """SELECT acronym, ordinal
 +
                FROM %s
 +
                """ % (dataSource.tableName)
 +
            sqlDB = EventStorage.getSqlDB()
 +
            lod = sqlDB.query(sqlQuery)
 +
            series = {}
 +
            acronymRegexp = r'(?P<acronym>[A-Z]+)\s*[0-9]+'
 +
            for d in lod:
 +
                acronym = d.get('acronym')
 +
                if acronym:
 +
                    match = re.fullmatch(acronymRegexp, acronym)
 +
                    if match is None:
 +
                        continue
 +
                    seriesAcronym = match.group("acronym")
 +
                    if isinstance(seriesAcronym, str):
 +
                        if seriesAcronym in series:
 +
                            series[seriesAcronym].append(d)
 +
                        else:
 +
                            series[seriesAcronym] = [d]
 +
            aggLod = []
 +
            for series, eventRecords in series.items():
 +
                # set operation
 +
                ordinals: List[int] = [int(r.get("ordinal"))
 +
                                      for r in eventRecords
 +
                                      if r.get("ordinal")
 +
                                      and ((isinstance(r.get("ordinal"), str) and r.get("ordinal").isnumeric()) or isinstance(r.get("ordinal"), int))]
 +
                if len(ordinals) == 0:
 +
                    continue
 +
                minOrd = min(ordinals)
 +
                maxOrd = max(ordinals)
 +
                numberOfDistinctOrds = len(set(ordinals))
 +
                # count set content
 +
                res = {
 +
                    "series": series,
 +
                    "minOrdinal": minOrd,
 +
                    "maxOrdinal": maxOrd,
 +
                    "avgOrdinal": mean(ordinals),
 +
                    "span": maxOrd-minOrd,
 +
                    "available": numberOfDistinctOrds,
 +
                    "completeness": numberOfDistinctOrds / maxOrd if maxOrd>1 else 1.0
 +
                }
 +
                aggLod.append(res)
 +
            figure=Figure(dataSource.title,caption=f"event series completeness of {dataSource.name}",figLabel=f"esca-{dataSource.name}",sqlQuery=None,fileNames=[histOutputFileName])
 +
            self.figureList.add(figure)
 +
             
 +
            values = [round(record["completeness"], 2) for record in aggLod if isinstance(record["completeness"], float)]
 +
            values.sort()
 +
            threshold =values[len(values)//2]
 +
            h = Histogramm(x=values)
 +
            hps = PlotSettings(outputFile=f"{self.histroot}/{histOutputFileName}", callback=histogrammSettings)
 +
            h.show(xLabel='completeness',
 +
                  yLabel='distribution',
 +
                  title=f'{figure.title}',
 +
                  alpha=self.alpha,
 +
                  density=True,
 +
                  ps=hps,
 +
                  bins=10,
 +
                  vlineAt=threshold)
 +
       
 +
            print(dataSource, len(values), "→", len(values) // 2)
 +
        self.figureList.printAllMarkups()
 +
</source>
 +
== CEUR-WS ==
 +
=== event series completeness of ceurws ===
 +
[[File:eventSeriesCompletionByAcronymHistogramm_ceurws.png|600px]]
 +
== confref ==
 +
== Crossref ==
 +
=== event series completeness of crossref ===
 +
[[File:eventSeriesCompletionByAcronymHistogramm_crossref.png|600px]]
 +
== dblp ==
 +
=== event series completeness of dblp ===
 +
[[File:eventSeriesCompletionByAcronymHistogramm_dblp.png|600px]]
 +
== GND ==
 +
=== event series completeness of gnd ===
 +
[[File:eventSeriesCompletionByAcronymHistogramm_gnd.png|600px]]
 +
== OpenResearch ==
 +
=== event series completeness of or ===
 +
[[File:eventSeriesCompletionByAcronymHistogramm_or.png|600px]]
 +
== OpenResearch ==
 +
=== event series completeness of orbackup ===
 +
[[File:eventSeriesCompletionByAcronymHistogramm_orbackup.png|600px]]
 +
== OpenResearch ==
 +
=== event series completeness of orclone ===
 +
[[File:eventSeriesCompletionByAcronymHistogramm_orclone.png|600px]]
 +
== OpenResearch ==
 +
=== event series completeness of orclonebackup ===
 +
[[File:eventSeriesCompletionByAcronymHistogramm_orclonebackup.png|600px]]
 +
== TIBKAT ==
 +
=== event series completeness of tibkat ===
 +
[[File:eventSeriesCompletionByAcronymHistogramm_tibkat.png|600px]]
 +
== WikiCFP ==
 +
=== event series completeness of wikicfp ===
 +
[[File:eventSeriesCompletionByAcronymHistogramm_wikicfp.png|600px]]
 +
== Wikidata ==
 +
=== event series completeness of wikidata ===
 +
[[File:eventSeriesCompletionByAcronymHistogramm_wikidata.png|600px]]

Latest revision as of 11:36, 29 May 2022

Introduction

SQL Query for Event Series completion

-- Per dblp event series: lowest, highest and average ordinal, the ordinal
-- range, and a completeness estimate = ordinal range / (highest ordinal - 1).
SELECT
   series,
   MIN(ordinal) AS minOrdinal,
   MAX(ordinal) AS maxOrdinal,
   AVG(ordinal) AS avgOrdinal,
   -- span between the lowest and highest known ordinal, not a count of events;
   -- named ordinalRange like the per-source completeness queries on this page
   MAX(ordinal) - MIN(ordinal) AS ordinalRange,
   -- NULLIF yields NULL instead of a division by zero for series whose
   -- highest ordinal is 1
   (MAX(ordinal) - MIN(ordinal)) / NULLIF(MAX(ordinal) - 1.0, 0) AS completeness
FROM event_dblp
WHERE ordinal IS NOT NULL
GROUP BY series
-- sort by name rather than by column position so the ordering survives
-- changes to the select list
ORDER BY completeness DESC

Event Signature Availability

see also http://conferencecorpus.bitplan.com/query/SignatureAvailability

signature completeness combined

CompleteSignature complete.png

signature completeness of acronym

CompleteSignature acronym.png

signature completeness of startDate

CompleteSignature startDate.png

signature completeness of ordinal

CompleteSignature ordinal.png

signature completeness of year

CompleteSignature year.png

signature completeness of title

CompleteSignature title.png

signature completeness of city

CompleteSignature city.png

signature completeness of country

[[File:completeSignature_country.png|600px]]

Ordinal histogramms

The Zipf diagrams show the log frequency (leaving out the first event, since typically the decline begins from the second event)

confref

Too few available ordinals for analysis.

CEUR-WS

sql query

-- Ordinals of CEUR-WS events; rows without an ordinal are skipped and only
-- ordinals below 50 are returned.
SELECT
    ordinal
FROM event_ceurws
WHERE ordinal IS NOT NULL
  AND ordinal < 50

ceurws ordinals

Ordinalhistogramm event ceurws.pngZipf event ceurws.png

confref

not enough data

Crossref

sql query

-- Ordinals of Crossref events; rows without an ordinal are skipped and only
-- ordinals below 50 are returned.
SELECT
    ordinal
FROM event_crossref
WHERE ordinal IS NOT NULL
  AND ordinal < 50

crossref ordinals

Ordinalhistogramm event crossref.pngZipf event crossref.png

dblp

sql query

-- Ordinals of dblp events; rows without an ordinal are skipped and only
-- ordinals below 50 are returned.
SELECT
    ordinal
FROM event_dblp
WHERE ordinal IS NOT NULL
  AND ordinal < 50

dblp ordinals

Ordinalhistogramm event dblp.pngZipf event dblp.png

GND

sql query

-- Ordinals of GND events; rows without an ordinal are skipped and only
-- ordinals below 50 are returned.
SELECT
    ordinal
FROM event_gnd
WHERE ordinal IS NOT NULL
  AND ordinal < 50

gnd ordinals

Ordinalhistogramm event gnd.pngZipf event gnd.png

OpenResearch

sql query

-- Ordinals of OpenResearch (event_or) events; rows without an ordinal are
-- skipped and only ordinals below 50 are returned.
SELECT
    ordinal
FROM event_or
WHERE ordinal IS NOT NULL
  AND ordinal < 50

or ordinals

Ordinalhistogramm event or.pngZipf event or.png

OpenResearch

sql query

-- Ordinals of OpenResearch (event_orbackup) events; rows without an ordinal
-- are skipped and only ordinals below 50 are returned.
SELECT
    ordinal
FROM event_orbackup
WHERE ordinal IS NOT NULL
  AND ordinal < 50

orbackup ordinals

Ordinalhistogramm event orbackup.pngZipf event orbackup.png

OpenResearch

sql query

-- Ordinals of OpenResearch (event_orclone) events; rows without an ordinal
-- are skipped and only ordinals below 50 are returned.
SELECT
    ordinal
FROM event_orclone
WHERE ordinal IS NOT NULL
  AND ordinal < 50

orclone ordinals

Ordinalhistogramm event orclone.pngZipf event orclone.png

OpenResearch

sql query

-- Ordinals of OpenResearch (event_orclonebackup) events; rows without an
-- ordinal are skipped and only ordinals below 50 are returned.
SELECT
    ordinal
FROM event_orclonebackup
WHERE ordinal IS NOT NULL
  AND ordinal < 50

orclonebackup ordinals

Ordinalhistogramm event orclonebackup.pngZipf event orclonebackup.png

TIBKAT

sql query

-- Ordinals of TIBKAT events; rows without an ordinal are skipped and only
-- ordinals below 50 are returned.
SELECT
    ordinal
FROM event_tibkat
WHERE ordinal IS NOT NULL
  AND ordinal < 50

tibkat ordinals

Ordinalhistogramm event tibkat.pngZipf event tibkat.png

WikiCFP

sql query

-- Ordinals of WikiCFP events; rows without an ordinal are skipped and only
-- ordinals below 50 are returned.
SELECT
    ordinal
FROM event_wikicfp
WHERE ordinal IS NOT NULL
  AND ordinal < 50

wikicfp ordinals

Ordinalhistogramm event wikicfp.pngZipf event wikicfp.png

Wikidata

sql query

-- Ordinals of Wikidata events; rows without an ordinal are skipped and only
-- ordinals below 50 are returned.
SELECT
    ordinal
FROM event_wikidata
WHERE ordinal IS NOT NULL
  AND ordinal < 50

wikidata ordinals

Ordinalhistogramm event wikidata.pngZipf event wikidata.png

Eventseries completeness

The queries here only give a rough estimate for an upper bound, since they assume that all events between min(ordinal) and max(ordinal) are available, which is often not the case.

dblp

sql query

-- Per dblp event series: lowest, highest and average ordinal, the ordinal
-- range, and a completeness estimate = ordinal range / (highest ordinal - 1).
SELECT
       series,
       MIN(ordinal) AS minOrdinal,
       MAX(ordinal) AS maxOrdinal,
       AVG(ordinal) AS avgOrdinal,
       -- span between the lowest and highest known ordinal, not a count of events
       MAX(ordinal) - MIN(ordinal) AS ordinalRange,
       -- NULLIF yields NULL instead of a division by zero for series whose
       -- highest ordinal is 1
       (MAX(ordinal) - MIN(ordinal)) / NULLIF(MAX(ordinal) - 1.0, 0) AS completeness
FROM event_dblp
WHERE ordinal IS NOT NULL
GROUP BY series
-- sort by name rather than by column position so the ordering survives
-- changes to the select list
ORDER BY completeness DESC

event series completeness of dblp

Dblp series completeness.png

OpenResearch

sql query

-- Per OpenResearch (event_orclone) event series: lowest, highest and average
-- ordinal, the ordinal range, and a completeness estimate
-- = ordinal range / (highest ordinal - 1).
SELECT
       inEventSeries,
       MIN(ordinal) AS minOrdinal,
       MAX(ordinal) AS maxOrdinal,
       AVG(ordinal) AS avgOrdinal,
       -- span between the lowest and highest known ordinal, not a count of events
       MAX(ordinal) - MIN(ordinal) AS ordinalRange,
       -- NULLIF yields NULL instead of a division by zero for series whose
       -- highest ordinal is 1
       (MAX(ordinal) - MIN(ordinal)) / NULLIF(MAX(ordinal) - 1.0, 0) AS completeness
FROM event_orclone
WHERE ordinal IS NOT NULL
GROUP BY inEventSeries
-- sort by name rather than by column position so the ordering survives
-- changes to the select list
ORDER BY completeness DESC

event series completeness of orclone

Orclone series completeness.png

WikiCFP

sql query

-- Per WikiCFP event series: lowest, highest and average ordinal, the ordinal
-- range, and a completeness estimate = ordinal range / (highest ordinal - 1).
SELECT
       seriesId,
       MIN(ordinal) AS minOrdinal,
       MAX(ordinal) AS maxOrdinal,
       AVG(ordinal) AS avgOrdinal,
       -- span between the lowest and highest known ordinal, not a count of events
       MAX(ordinal) - MIN(ordinal) AS ordinalRange,
       -- NULLIF yields NULL instead of a division by zero for series whose
       -- highest ordinal is 1
       (MAX(ordinal) - MIN(ordinal)) / NULLIF(MAX(ordinal) - 1.0, 0) AS completeness
FROM event_wikicfp
WHERE ordinal IS NOT NULL
GROUP BY seriesId
-- sort by name rather than by column position so the ordering survives
-- changes to the select list
ORDER BY completeness DESC

event series completeness of wikicfp

Wikicfp series completeness.png

Wikidata

sql query

-- Per Wikidata event series: lowest, highest and average ordinal, the ordinal
-- range, and a completeness estimate = ordinal range / (highest ordinal - 1).
SELECT
       eventInSeriesId,
       MIN(ordinal) AS minOrdinal,
       MAX(ordinal) AS maxOrdinal,
       AVG(ordinal) AS avgOrdinal,
       -- span between the lowest and highest known ordinal, not a count of events
       MAX(ordinal) - MIN(ordinal) AS ordinalRange,
       -- NULLIF yields NULL instead of a division by zero for series whose
       -- highest ordinal is 1
       (MAX(ordinal) - MIN(ordinal)) / NULLIF(MAX(ordinal) - 1.0, 0) AS completeness
FROM event_wikidata
WHERE ordinal IS NOT NULL
GROUP BY eventInSeriesId
-- sort by name rather than by column position so the ordering survives
-- changes to the select list
ORDER BY completeness DESC

event series completeness of wikidata

Wikidata series completeness.png

Eventseries completeness by acronym

Not all data sources have an eventseries reference per event. As an alternative approach we analyzed the acronyms to group event series. Here we also calculated the number of distinct ordinals.

The results will in some cases mix data from event series with ambiguous acronyms and therefore again give an overly positive impression. So this, too, is an estimate for an upper bound of the event series completeness.

Python Code for Eventseries completeness analysis

   def testSeriesCompletenessHistogrammByAcronym(self):
        '''
        Plot one event series completeness histogram per data source, with
        events grouped into series by the letter prefix of their acronym.

        For every data source except "acm" and "confref" all (acronym, ordinal)
        pairs are read from the source's event table. Acronyms consisting of
        upper case letters followed by a number (e.g. "ABC 2020") are grouped
        by their letter prefix. Per group the completeness is the number of
        distinct ordinals divided by the highest ordinal (1.0 when the highest
        ordinal is not greater than 1). The sorted completeness values are
        plotted as a histogram with a vertical line at the upper median, and
        the figure is registered in self.figureList.

        Data sources without any usable series are reported and skipped
        instead of failing on the median lookup.

        Reads self.histroot (output directory of the image files) and
        self.alpha (handed through to the plot).
        '''
        def histogrammSettings(plot):
            '''
            optional callback to add more data to histogramm
            '''
            pass

        def toOrdinal(value):
            '''
            Convert a raw ordinal cell to int; None if it is missing or unusable.
            '''
            if isinstance(value, int):
                # a falsy integer (0) counts as "no ordinal"
                return int(value) if value else None
            # isdecimal() rather than isnumeric(): characters such as "²" or "½"
            # are numeric but make int() raise a ValueError
            if isinstance(value, str) and value.isdecimal():
                return int(value)
            return None

        self.figureList=FigureList(caption="event Series completeness by acronym",figureListLabel="eventcompa",cols=3)
        # upper case letter prefix, optional whitespace, trailing number
        acronymRegexp = r'(?P<acronym>[A-Z]+)\s*[0-9]+'
        for dataSource in DataSource.sources.values():
            if dataSource.name in ["acm","confref"]:
                continue
            print(dataSource)
            histOutputFileName=f"eventSeriesCompletionByAcronymHistogramm_{dataSource.name}.png"
            sqlQuery = """SELECT acronym, ordinal
                FROM %s
                """ % (dataSource.tableName)
            sqlDB = EventStorage.getSqlDB()
            lod = sqlDB.query(sqlQuery)

            # group the event records by the letter prefix of their acronym
            recordsBySeries = {}
            for d in lod:
                acronym = d.get('acronym')
                if not acronym:
                    continue
                match = re.fullmatch(acronymRegexp, acronym)
                if match is None:
                    continue
                recordsBySeries.setdefault(match.group("acronym"), []).append(d)

            # aggregate the ordinals of each series
            aggLod = []
            for seriesAcronym, eventRecords in recordsBySeries.items():
                candidates = (toOrdinal(r.get("ordinal")) for r in eventRecords)
                ordinals: List[int] = [o for o in candidates if o is not None]
                if len(ordinals) == 0:
                    continue
                minOrd = min(ordinals)
                maxOrd = max(ordinals)
                # duplicates of the same ordinal count only once
                numberOfDistinctOrds = len(set(ordinals))
                res = {
                    "series": seriesAcronym,
                    "minOrdinal": minOrd,
                    "maxOrdinal": maxOrd,
                    "avgOrdinal": mean(ordinals),
                    "span": maxOrd-minOrd,
                    "available": numberOfDistinctOrds,
                    "completeness": numberOfDistinctOrds / maxOrd if maxOrd>1 else 1.0
                }
                aggLod.append(res)

            values = sorted(round(record["completeness"], 2) for record in aggLod)
            if not values:
                # nothing to plot and no median to mark - do not register a figure
                print(f"{dataSource.name}: no event series with usable ordinals - histogram skipped")
                continue

            figure=Figure(dataSource.title,caption=f"event series completeness of {dataSource.name}",figLabel=f"esca-{dataSource.name}",sqlQuery=None,fileNames=[histOutputFileName])
            self.figureList.add(figure)

            # upper median of the sorted values marks the vertical line
            threshold = values[len(values)//2]
            h = Histogramm(x=values)
            hps = PlotSettings(outputFile=f"{self.histroot}/{histOutputFileName}", callback=histogrammSettings)
            h.show(xLabel='completeness',
                   yLabel='distribution',
                   title=f'{figure.title}',
                   alpha=self.alpha,
                   density=True,
                   ps=hps,
                   bins=10,
                   vlineAt=threshold)

            print(dataSource, len(values), "→", len(values) // 2)
        self.figureList.printAllMarkups()

CEUR-WS

event series completeness of ceurws

EventSeriesCompletionByAcronymHistogramm ceurws.png

confref

Crossref

event series completeness of crossref

EventSeriesCompletionByAcronymHistogramm crossref.png

dblp

event series completeness of dblp

EventSeriesCompletionByAcronymHistogramm dblp.png

GND

event series completeness of gnd

EventSeriesCompletionByAcronymHistogramm gnd.png

OpenResearch

event series completeness of or

EventSeriesCompletionByAcronymHistogramm or.png

OpenResearch

event series completeness of orbackup

EventSeriesCompletionByAcronymHistogramm orbackup.png

OpenResearch

event series completeness of orclone

EventSeriesCompletionByAcronymHistogramm orclone.png

OpenResearch

event series completeness of orclonebackup

EventSeriesCompletionByAcronymHistogramm orclonebackup.png

TIBKAT

event series completeness of tibkat

EventSeriesCompletionByAcronymHistogramm tibkat.png

WikiCFP

event series completeness of wikicfp

EventSeriesCompletionByAcronymHistogramm wikicfp.png

Wikidata

event series completeness of wikidata

EventSeriesCompletionByAcronymHistogramm wikidata.png