Truly Tabular RDF/GND: Difference between revisions
Jump to navigation
Jump to search
No edit summary |
|||
| (2 intermediate revisions by the same user not shown) | |||
| Line 1: | Line 1: | ||
{{Link|target=Truly Tabular RDF}} | |||
{{GlossaryEntry | {{GlossaryEntry | ||
|responsible=https://www.dnb.de/DE/Professionell/Standardisierung/GND/gnd.html | |responsible=https://www.dnb.de/DE/Professionell/Standardisierung/GND/gnd.html | ||
| Line 20: | Line 21: | ||
* https://blog.lobid.org/2018/08/27/openrefine.html | * https://blog.lobid.org/2018/08/27/openrefine.html | ||
== GND property multiplicity == | == GND property multiplicity == | ||
Based on analysis of https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz as of 2021-12 | |||
{| class="wikitable" | {| class="wikitable" | ||
|+ Property multiplicity | |+ Property multiplicity | ||
| Line 122: | Line 123: | ||
| style=" "| 0.0163 | | style=" "| 0.0163 | ||
|} | |} | ||
=== performance optimized query of GND event details === | |||
<source lang='sparql'> | |||
# performance optimized query of GND event details | |||
# with aggregated properties as single, count and | separated list column | |||
# WF 2021-12-05 | |||
PREFIX gndi: <https://d-nb.info/gnd> | |||
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#> | |||
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/> | |||
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | |||
PREFIX owl: <http://www.w3.org/2002/07/owl#> | |||
PREFIX dc: <http://purl.org/dc/terms/> | |||
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#> | |||
SELECT | |||
?event | |||
?eventId | |||
(MIN(?eventTitle) as ?title) | |||
(COUNT (DISTINCT ?eventDate) as ?dateCount) | |||
(MIN(?eventDate) as ?date) | |||
(MIN(?eventAcronym) as ?acronym) | |||
(COUNT (DISTINCT ?eventAcronym) as ?acronymCount) | |||
(GROUP_CONCAT(DISTINCT ?eventAcronym; SEPARATOR="| ") AS ?acronyms) | |||
(MIN(?eventVariant) as ?variant) | |||
(COUNT (DISTINCT ?eventVariant) as ?variantCount) | |||
(GROUP_CONCAT(DISTINCT ?eventVariant; SEPARATOR="| ") AS ?variants) | |||
(MIN(?eventPlace) as ?place) | |||
(COUNT (DISTINCT ?eventPlace) as ?placeCount) | |||
(GROUP_CONCAT(DISTINCT ?eventPlace; SEPARATOR="| ") AS ?places) | |||
(MIN(?eventHomepage) as ?homepage) | |||
WHERE { | |||
?event a gnd:ConferenceOrEvent. | |||
?event gnd:gndIdentifier ?eventId. | |||
?event gnd:preferredNameForTheConferenceOrEvent ?eventTitle. | |||
OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?eventAcronym. } | |||
OPTIONAL { ?event gnd:homepage ?eventHomepage. } | |||
OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?eventVariant. } | |||
OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?eventDate. } | |||
OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?eventPlace } | |||
# only available 3520 times 2021-12 | |||
# ?event gnd:topic ?topic. | |||
# only available 12106 times 2021-12 | |||
# ?event gnd:precedingConferenceOrEvent ?prec | |||
# only available 11929 times 2021-12 | |||
#?event gnd:succeedingConferenceOrEvent ?succ | |||
} | |||
GROUP BY ?event ?eventId | |||
</source> | |||
=== query to analyze multiplicity === | === query to analyze multiplicity === | ||
<source lang='sparql'> | <source lang='sparql'> | ||
| Line 161: | Line 214: | ||
</source> | </source> | ||
== | == SPARQL Queries == | ||
<source lang=' | === entities and usage frequency === | ||
# | <source lang='SPARQL'> | ||
# get histogram data of entities by | |||
# usage frequency | |||
# WF 2020-06-27 | |||
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#> | |||
SELECT ?c (COUNT(?c) AS ?count) | |||
WHERE { | |||
?subject a ?c | |||
} | |||
GROUP BY ?c | |||
HAVING (?count >100) | |||
ORDER BY DESC(?count) | |||
</source> | |||
{|class="wikitable sortable" | |||
|+ | |||
!c!!count | |||
|- | |||
||gnd#ConferenceOrEvent||713310 | |||
|- | |||
||gnd#TerritorialCorporateBodyOrAdministrativeUnit||188246 | |||
|- | |||
||gnd#SeriesOfConferenceOrEvent||122970 | |||
|- | |||
||gnd#BuildingOrMemorial||67149 | |||
|- | |||
||http://www.opengis.net/ont/sf#Point||57987 | |||
|- | |||
||gnd#PlaceOrGeographicName||27771 | |||
|- | |||
||gnd#NaturalGeographicUnit||20269 | |||
|- | |||
||gnd#AdministrativeUnit||12846 | |||
|- | |||
||gnd#WayBorderOrLine||4971 | |||
|- | |||
||gnd#ReligiousTerritory||2646 | |||
|- | |||
||gnd#NameOfSmallGeographicUnitLyingWithinAnotherGeographicUnit||2113 | |||
|- | |||
||gnd#CorporateBody||559 | |||
|- | |||
||gnd#MemberState||543 | |||
|- | |||
||gnd#Country||307 | |||
|- | |||
||gnd#ExtraterrestrialTerritory||282 | |||
|- | |||
||gnd#Language||193 | |||
|- | |||
||gnd#ReligiousCorporateBody||155 | |||
|- | |||
||gnd#ReligiousAdministrativeUnit||134 | |||
|- | |||
||gnd#HistoricSingleEventOrEra||128 | |||
|} | |||
=== relevance of fields === | |||
<source lang='SPARQL'> | |||
# get histogram data of properties by | |||
# usage frequency | |||
# WF 2020-07-12 | |||
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#> | |||
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | |||
PREFIX owl: <http://www.w3.org/2002/07/owl#> | |||
PREFIX dc: <http://purl.org/dc/terms/> | |||
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#> | |||
SELECT ?property (COUNT(?property) AS ?propTotal) | |||
WHERE { ?s ?property ?o . } | |||
GROUP BY ?property | |||
HAVING (?propTotal >1000) | |||
ORDER BY DESC(?propTotal) | |||
</source> | </source> | ||
= | {|class="wikitable sortable" | ||
|+ | |||
!property!!propTotal | |||
|- | |||
||http://www.w3.org/2002/07/owl#sameAs||1359115 | |||
|- | |||
||http://www.w3.org/1999/02/22-rdf-syntax-ns#type||1222895 | |||
|- | |||
||gnd#geographicAreaCode||1195102 | |||
|- | |||
||http://purl.org/dc/terms/license||1149468 | |||
| | |- | ||
||http://purl.org/dc/terms/modified||1149468 | |||
|- | |||
||http://www.w3.org/2007/05/powder-s#describedby||1149468 | |||
|- | |||
||gnd#gndIdentifier||1149468 | |||
|- | |||
||gnd#oldAuthorityNumber||940550 | |||
|- | |||
||gnd#preferredNameForTheConferenceOrEvent||836397 | |||
|- | |||
||gnd#variantNameForTheConferenceOrEvent||720355 | |||
|- | |||
||gnd#dateOfConferenceOrEvent||693196 | |||
|- | |||
||gnd#placeOfConferenceOrEvent||650165 | |||
|- | |||
||gnd#preferredNameForThePlaceOrGeographicName||313058 | |||
|- | |||
||gnd#variantNameForThePlaceOrGeographicName||298383 | |||
|- | |||
||gnd#gndSubjectCategory||176395 | |||
|- | |||
||gnd#definition||157695 | |||
|- | |||
||gnd#broaderTermInstantial||117334 | |||
|- | |||
||gnd#place||82345 | |||
|- | |||
||http://xmlns.com/foaf/0.1/page||74542 | |||
|- | |||
||http://www.opengis.net/ont/geosparql#asWKT||58083 | |||
|- | |||
||http://www.opengis.net/ont/geosparql#hasGeometry||58083 | |||
|- | |||
||https://d-nb.info/standards/elementset/dnb#deprecatedUri||51576 | |||
|- | |||
||gnd#biographicalOrHistoricalInformation||34190 | |||
|- | |||
||gnd#organizerOrHost||34167 | |||
|- | |||
||gnd#relatedDdcWithDegreeOfDeterminacy2||25452 | |||
|- | |||
||gnd#succeedingPlaceOrGeographicName||25208 | |||
|- | |||
||gnd#broaderTermPartitive||25111 | |||
|- | |||
||gnd#dateOfEstablishment||24772 | |||
|- | |||
||gnd#homepage||24439 | |||
|- | |||
||gnd#precedingPlaceOrGeographicName||22490 | |||
|- | |||
||gnd#dateOfTermination||21582 | |||
|- | |||
||gnd#hierarchicalSuperiorOfPlaceOrGeographicName||20522 | |||
|- | |||
||gnd#precedingConferenceOrEvent||17164 | |||
|- | |||
||gnd#succeedingConferenceOrEvent||16963 | |||
|- | |||
||gnd#dateOfProduction||15106 | |||
|- | |||
||gnd#spatialAreaOfActivity||13593 | |||
|- | |||
||gnd#hierarchicalSuperiorOfTheConferenceOrEvent||10126 | |||
|- | |||
||gnd#architect||8206 | |||
|- | |||
||gnd#topic||6088 | |||
|- | |- | ||
||gnd#relatedPlaceOrGeographicName||5812 | |||
|- | |- | ||
| | ||gnd#abbreviatedNameForTheConferenceOrEvent||5312 | ||
|- | |- | ||
| | ||gnd#complexSeeReferenceSubject||3927 | ||
|- | |- | ||
| | ||gnd#startingOrFinalPointOfADistance||3531 | ||
|- | |- | ||
| | ||gnd#relatedConferenceOrEvent||3124 | ||
|- | |- | ||
| | ||gnd#relatedDdcWithDegreeOfDeterminacy4||2709 | ||
|- | |- | ||
| | ||gnd#relatedCorporateBody||2354 | ||
|- | |- | ||
| | ||gnd#relatedTerm||1728 | ||
|- | |- | ||
| | ||gnd#sponsorOrPatron||1281 | ||
|- | |- | ||
| | ||gnd#relatedDdcWithDegreeOfDeterminacy3||1200 | ||
|- | |- | ||
| | ||gnd#exhibitor||1044 | ||
|} | |} | ||
}} | {{Fixme|todo=Unfortunately the headlines and the forms seem to be mixed up.|done=2020-06-25|by=--[[User:Wf|Wf]] ([[User talk:Wf|talk]]) 14:59, 25 June 2020 (CEST)}} | ||
== events with most often used fields and seldom but useful fields == | |||
<source lang='sparql'> | |||
# get events with most often used columns from GND | |||
# plus acronym, topic, homepage (seldom but useful) | |||
# WF 2020-07-12 | |||
PREFIX gndi: <https://d-nb.info/gnd> | |||
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#> | |||
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/> | |||
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | |||
PREFIX owl: <http://www.w3.org/2002/07/owl#> | |||
PREFIX dc: <http://purl.org/dc/terms/> | |||
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#> | |||
SELECT ?event ?eventId ?acronym ?variant ?name ?date ?areaCode ?place ?topic ?homepage | |||
WHERE { | |||
?event gnd:gndIdentifier ?eventId. | |||
OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym. } | |||
OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?variant.} | |||
OPTIONAL { ?event gnd:preferredNameForTheConferenceOrEvent ?name.} | |||
OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?date. } | |||
OPTIONAL { ?event gnd:geographicAreaCode ?areaCode. } | |||
OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?place. } | |||
OPTIONAL { ?event gnd:topic ?topic. } | |||
OPTIONAL { ?event gnd:homepage ?homepage. } | |||
} | |||
#LIMIT 10000 | |||
</source> | |||
== Script to setup a Jena instance with GND data extract == | |||
=== gnd2jena === | |||
<source lang='bash'> | |||
#!/bin/bash | |||
# WF 2020-05-10 | |||
# global settings | |||
jena=apache-jena-4.4.0 | |||
tgz=$jena.tar.gz | |||
#mirror=https://downloads.apache.org/jena/binaries | |||
mirror=https://archive.apache.org/dist/jena/binaries | |||
jenaurl=$mirror/$tgz | |||
base=/hd/seel/gnd | |||
#base=/hd/torterra/gnd | |||
#base=/hd/luxio/gnd | |||
data=$base/data | |||
tdbloader=$jena/bin/tdb2.tdbloader | |||
getjena() { | |||
# download | |||
if [ ! -f $tgz ] | |||
then | |||
echo "downloading $tgz from $jenaurl" | |||
wget $jenaurl | |||
else | |||
echo "$tgz already downloaded" | |||
fi | |||
# unpack | |||
if [ ! -d $jena ] | |||
then | |||
echo "unpacking $jena from $tgz" | |||
tar xvzf $tgz | |||
else | |||
echo "$jena already unpacked" | |||
fi | |||
# create data directory | |||
if [ ! -d $data ] | |||
then | |||
echo "creating $data directory" | |||
mkdir -p $data | |||
else | |||
echo "$data directory already created" | |||
fi | |||
} | |||
# | |||
# show the given timestamp | |||
# | |||
timestamp() { | |||
local msg="$1" | |||
local ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ") | |||
echo "$msg at $ts" | |||
} | |||
# | |||
# load data for the given data dir and input | |||
# | |||
loaddata() { | |||
local data="$1" | |||
local input="$2" | |||
timestamp "start loading $input to $data" | |||
$tdbloader --loc "$data" "$input" > tdb2-$phase-out.log 2> tdb2-$phase-err.log | |||
timestamp "finished loading $input to $data" | |||
} | |||
getjena | |||
export TMPDIR=$base/tmp | |||
if [ ! -d $TMPDIR ] | |||
then | |||
echo "creating temporary directory $TMPDIR" | |||
mkdir $TMPDIR | |||
else | |||
echo "using temporary directory $TMPDIR" | |||
fi | |||
for d in kongress geografikum | |||
do | |||
file=authorities-${d}_lds.ttl | |||
if [ ! -f $file ] | |||
then | |||
wget https://data.dnb.de/opendata/$file.gz | |||
gunzip $file.gz | |||
else | |||
echo "$file already downloaded" | |||
fi | |||
loaddata $data $file | |||
done | |||
</source> | |||
== try on confident23 server == | |||
=== 2020-07-19 === | |||
<source lang='bash'> | |||
wf@confident23:/usr/local/src$ sudo ./getjena | |||
apache-jena-3.16.0.tar.gz already downloaded | |||
apache-jena-3.16.0 already unpacked | |||
creating /var/data/gnd/data directory | |||
creating temporary directory /var/data/gnd/tmp | |||
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:08:15Z | |||
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:11:16Z | |||
</source> | |||
=== 2020-09-15 === | |||
<source lang='bash'> | |||
./getjena | |||
apache-jena-3.16.0.tar.gz already downloaded | |||
apache-jena-3.16.0 already unpacked | |||
creating /var/data/gnd/data directory | |||
creating temporary directory /var/data/gnd/tmp | |||
--2020-09-15 11:46:37-- https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz | |||
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140 | |||
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected. | |||
HTTP request sent, awaiting response... 200 OK | |||
Length: 80381713 (77M) [application/x-gzip] | |||
Saving to: ‘authorities-kongress_lds.ttl.gz’ | |||
authorities-kongres 100%[===================>] 76.66M 66.3MB/s in 1.2s | |||
2020-09-15 11:46:38 (66.3 MB/s) - ‘authorities-kongress_lds.ttl.gz’ saved [80381713/80381713] | |||
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:46:42Z | |||
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:25Z | |||
--2020-09-15 11:50:25-- https://data.dnb.de/opendata/authorities-geografikum_lds.ttl.gz | |||
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140 | |||
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected. | |||
HTTP request sent, awaiting response... 200 OK | |||
Length: 33842092 (32M) [application/x-gzip] | |||
Saving to: ‘authorities-geografikum_lds.ttl.gz’ | |||
authorities-geograf 100%[===================>] 32.27M 53.4MB/s in 0.6s | |||
2020-09-15 11:50:26 (53.4 MB/s) - ‘authorities-geografikum_lds.ttl.gz’ saved [33842092/33842092] | |||
start loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:27Z | |||
finished loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:52:40Z | |||
</source> | |||
== start fuseki == | |||
<source lang='bash'> | |||
#!/bin/bash | |||
# WF 2020-06-25 | |||
# Jena Fuseki server installation | |||
# see https://jena.apache.org/documentation/fuseki2/fuseki-run.html | |||
version=3.16.0 | |||
fuseki=apache-jena-fuseki-$version | |||
if [ ! -d $fuseki ] | |||
then | |||
if [ ! -f $fuseki.tar.gz ] | |||
then | |||
wget http://archive.apache.org/dist/jena/binaries/$fuseki.tar.gz | |||
else | |||
echo $fuseki.tar.gz already downloaded | |||
fi | |||
echo "unpacking $fuseki.tar.gz" | |||
tar xvfz $fuseki.tar.gz | |||
else | |||
echo $fuseki already downloaded and unpacked | |||
fi | |||
cd $fuseki | |||
gnddata=/var/data/gnd/data | |||
java -jar fuseki-server.jar --tdb2 --loc=$gnddata /gnd | |||
wf@confident23:/usr/local/src$ | |||
</source> | |||
Latest revision as of 08:24, 5 August 2022
| GlossaryEntry | |
|---|---|
| responsible | https://www.dnb.de/DE/Professionell/Standardisierung/GND/gnd.html |
| state | |
| since | 2012 |
| description | Gemeinsame Normdatei |
| references | |
| lang | de |
| master | GND |
Wikibase
Report "GND meets Wikibase" 2 Barbara Fischer
Links
- https://d-nb.info/standards/elementset/gnd
- https://d-nb.info/standards/elementset/gnd#SeriesOfConferenceOrEvent
- https://data.dnb.de/opendata/
- https://blog.lobid.org/2018/08/27/openrefine.html
GND property multiplicity
Based on analysis of https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz as of 2021-12
| property | gnd | total | unique | min | max | avg |
|---|---|---|---|---|---|---|
| eventId | gnd:gndIdentifier | 731651 | 731651 | 1 | 1 | 1 |
| title | gnd:preferredNameForTheConferenceOrEvent | 731645 | 731645 | 0 | 1 | 0.999991799 |
| acronym | gnd:abbreviatedNameForTheConferenceOrEvent | 3537 | 3206 | 0 | 4 | 0.00483 |
| sameAs | owl:sameAs | 769120 | 693077 | 0 | 20 | 1.05 |
| variant | gnd:variantNameForTheConferenceOrEvent | 632368 | 229268 | 0 | 41 | 0.86 |
| date | gnd:dateOfConferenceOrEvent | 710819 | 704949 | 0 | 9 | 0.971 |
| areaCode | gnd:geographicAreaCode | 797037 | 612631 | 0 | 11 | 1.089 |
| place | gnd:placeOfConferenceOrEvent | 659305 | 624667 | 0 | 18 | 0.901 |
| topic | gnd:topic | 5061 | 3520 | 0 | 6 | 0.00691 |
| homepage | gnd:homepage | 19011 | 18702 | 0 | 3 | 0.026 |
| prec | gnd:precedingConferenceOrEvent | 12182 | 12106 | 0 | 3 | 0.0166 |
| succ | gnd:succeedingConferenceOrEvent | 11974 | 11929 | 0 | 3 | 0.0163 |
performance optimized query of GND event details
# performance optimized query of GND event details
# Yields one row per event (grouped by ?event and ?eventId).
# Multi-valued properties are aggregated into up to three columns:
#   - a single value (MIN)
#   - the number of distinct values (COUNT)
#   - a "| " separated list of the distinct values (GROUP_CONCAT)
# WF 2021-12-05
PREFIX gndi: <https://d-nb.info/gnd>
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>
SELECT
  ?event
  ?eventId
  (MIN(?eventTitle) AS ?title)
  (COUNT(DISTINCT ?eventDate) AS ?dateCount)
  (MIN(?eventDate) AS ?date)
  (MIN(?eventAcronym) AS ?acronym)
  (COUNT(DISTINCT ?eventAcronym) AS ?acronymCount)
  (GROUP_CONCAT(DISTINCT ?eventAcronym; SEPARATOR="| ") AS ?acronyms)
  (MIN(?eventVariant) AS ?variant)
  (COUNT(DISTINCT ?eventVariant) AS ?variantCount)
  (GROUP_CONCAT(DISTINCT ?eventVariant; SEPARATOR="| ") AS ?variants)
  (MIN(?eventPlace) AS ?place)
  (COUNT(DISTINCT ?eventPlace) AS ?placeCount)
  (GROUP_CONCAT(DISTINCT ?eventPlace; SEPARATOR="| ") AS ?places)
  (MIN(?eventHomepage) AS ?homepage)
WHERE {
  # required: type, identifier and preferred name
  ?event a gnd:ConferenceOrEvent ;
         gnd:gndIdentifier ?eventId ;
         gnd:preferredNameForTheConferenceOrEvent ?eventTitle .
  # optional properties - an event without them is still returned
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?eventAcronym . }
  OPTIONAL { ?event gnd:homepage ?eventHomepage . }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?eventVariant . }
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?eventDate . }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?eventPlace . }
  # deliberately not queried - rarely available (counts as of 2021-12):
  #   3520 times:  ?event gnd:topic ?topic.
  #   12106 times: ?event gnd:precedingConferenceOrEvent ?prec
  #   11929 times: ?event gnd:succeedingConferenceOrEvent ?succ
}
GROUP BY ?event ?eventId
query to analyze multiplicity
# get aggregate counts of property usage (sum/min/max/avg of the number
# of values per event) for one property at a time
# how to use: enable exactly one triple pattern inside the OPTIONAL block,
# adapt the counted variable accordingly and run the query twice -
# once without the HAVING clause and once with the HAVING clause -
# to create the "GND property multiplicity" table of this wikipage
# WF 2021-12-05
PREFIX gndi: <https://d-nb.info/gnd>
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>
SELECT
  (SUM(?itemCount) AS ?sum)
  (MIN(?itemCount) AS ?min)
  (MAX(?itemCount) AS ?max)
  (AVG(?itemCount) AS ?avg)
WHERE {
  # inner query: number of values of the selected property per event
  SELECT ?event ?eventId (COUNT(?title) AS ?itemCount)
  WHERE {
    ?event a gnd:ConferenceOrEvent ;
           gnd:gndIdentifier ?eventId .
    OPTIONAL {
      # ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym.
      # ?event owl:sameAs ?sameAs.
      # ?event gnd:variantNameForTheConferenceOrEvent ?variant.
      ?event gnd:preferredNameForTheConferenceOrEvent ?title .
      # ?event gnd:dateOfConferenceOrEvent ?date
      # ?event gnd:geographicAreaCode ?areaCode.
      # ?event gnd:placeOfConferenceOrEvent ?place.
      # ?event gnd:topic ?topic.
      # ?event gnd:homepage ?homepage.
      # ?event gnd:precedingConferenceOrEvent ?prec
      # ?event gnd:succeedingConferenceOrEvent ?succ
    }
  }
  GROUP BY ?event ?eventId
  #HAVING(COUNT(?title) = 1)
}
SPARQL Queries
entities and usage frequency
# get histogram data of entities by usage frequency:
# counts the rdf:type statements per class ?c and keeps only
# classes that occur more than 100 times, most frequent first
# WF 2020-06-27
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
SELECT ?c (COUNT(?c) AS ?count)
WHERE {
  ?subject a ?c
}
GROUP BY ?c
# the aggregate is repeated here on purpose: HAVING is evaluated before the
# SELECT alias ?count is bound, so filtering on the alias is not portable
HAVING (COUNT(?c) > 100)
ORDER BY DESC(?count)
| c | count |
|---|---|
| gnd#ConferenceOrEvent | 713310 |
| gnd#TerritorialCorporateBodyOrAdministrativeUnit | 188246 |
| gnd#SeriesOfConferenceOrEvent | 122970 |
| gnd#BuildingOrMemorial | 67149 |
| http://www.opengis.net/ont/sf#Point | 57987 |
| gnd#PlaceOrGeographicName | 27771 |
| gnd#NaturalGeographicUnit | 20269 |
| gnd#AdministrativeUnit | 12846 |
| gnd#WayBorderOrLine | 4971 |
| gnd#ReligiousTerritory | 2646 |
| gnd#NameOfSmallGeographicUnitLyingWithinAnotherGeographicUnit | 2113 |
| gnd#CorporateBody | 559 |
| gnd#MemberState | 543 |
| gnd#Country | 307 |
| gnd#ExtraterrestrialTerritory | 282 |
| gnd#Language | 193 |
| gnd#ReligiousCorporateBody | 155 |
| gnd#ReligiousAdministrativeUnit | 134 |
| gnd#HistoricSingleEventOrEra | 128 |
relevance of fields
# get histogram data of properties by usage frequency:
# counts the triples per property and keeps only properties
# that are used more than 1000 times, most frequent first
# WF 2020-07-12
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>
SELECT ?property (COUNT(?property) AS ?propTotal)
WHERE {
  ?s ?property ?o .
}
GROUP BY ?property
# the aggregate is repeated here on purpose: HAVING is evaluated before the
# SELECT alias ?propTotal is bound, so filtering on the alias is not portable
HAVING (COUNT(?property) > 1000)
ORDER BY DESC(?propTotal)
| property | propTotal |
|---|---|
| http://www.w3.org/2002/07/owl#sameAs | 1359115 |
| http://www.w3.org/1999/02/22-rdf-syntax-ns#type | 1222895 |
| gnd#geographicAreaCode | 1195102 |
| http://purl.org/dc/terms/license | 1149468 |
| http://purl.org/dc/terms/modified | 1149468 |
| http://www.w3.org/2007/05/powder-s#describedby | 1149468 |
| gnd#gndIdentifier | 1149468 |
| gnd#oldAuthorityNumber | 940550 |
| gnd#preferredNameForTheConferenceOrEvent | 836397 |
| gnd#variantNameForTheConferenceOrEvent | 720355 |
| gnd#dateOfConferenceOrEvent | 693196 |
| gnd#placeOfConferenceOrEvent | 650165 |
| gnd#preferredNameForThePlaceOrGeographicName | 313058 |
| gnd#variantNameForThePlaceOrGeographicName | 298383 |
| gnd#gndSubjectCategory | 176395 |
| gnd#definition | 157695 |
| gnd#broaderTermInstantial | 117334 |
| gnd#place | 82345 |
| http://xmlns.com/foaf/0.1/page | 74542 |
| http://www.opengis.net/ont/geosparql#asWKT | 58083 |
| http://www.opengis.net/ont/geosparql#hasGeometry | 58083 |
| https://d-nb.info/standards/elementset/dnb#deprecatedUri | 51576 |
| gnd#biographicalOrHistoricalInformation | 34190 |
| gnd#organizerOrHost | 34167 |
| gnd#relatedDdcWithDegreeOfDeterminacy2 | 25452 |
| gnd#succeedingPlaceOrGeographicName | 25208 |
| gnd#broaderTermPartitive | 25111 |
| gnd#dateOfEstablishment | 24772 |
| gnd#homepage | 24439 |
| gnd#precedingPlaceOrGeographicName | 22490 |
| gnd#dateOfTermination | 21582 |
| gnd#hierarchicalSuperiorOfPlaceOrGeographicName | 20522 |
| gnd#precedingConferenceOrEvent | 17164 |
| gnd#succeedingConferenceOrEvent | 16963 |
| gnd#dateOfProduction | 15106 |
| gnd#spatialAreaOfActivity | 13593 |
| gnd#hierarchicalSuperiorOfTheConferenceOrEvent | 10126 |
| gnd#architect | 8206 |
| gnd#topic | 6088 |
| gnd#relatedPlaceOrGeographicName | 5812 |
| gnd#abbreviatedNameForTheConferenceOrEvent | 5312 |
| gnd#complexSeeReferenceSubject | 3927 |
| gnd#startingOrFinalPointOfADistance | 3531 |
| gnd#relatedConferenceOrEvent | 3124 |
| gnd#relatedDdcWithDegreeOfDeterminacy4 | 2709 |
| gnd#relatedCorporateBody | 2354 |
| gnd#relatedTerm | 1728 |
| gnd#sponsorOrPatron | 1281 |
| gnd#relatedDdcWithDegreeOfDeterminacy3 | 1200 |
| gnd#exhibitor | 1044 |
| done | 2020-06-25 — "2020-06-25" is not recognized as a Boolean (true/false) value. |
|---|---|
| todo | Unfortunately the headlines and the forms seem to be mixed up. |
events with most often used fields and seldom but useful fields
# get events with most often used columns from GND
# plus acronym, topic, homepage (seldom but useful)
# Every property except the identifier is OPTIONAL; a subject with several
# values for a property yields one result row per value combination.
# NOTE(review): there is no rdf:type restriction, so any subject that has a
# gnd:gndIdentifier matches - confirm that this is intended.
# WF 2020-07-12
PREFIX gndi: <https://d-nb.info/gnd>
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>
SELECT
  ?event
  ?eventId
  ?acronym
  ?variant
  ?name
  ?date
  ?areaCode
  ?place
  ?topic
  ?homepage
WHERE {
  ?event gnd:gndIdentifier ?eventId .
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym . }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?variant . }
  OPTIONAL { ?event gnd:preferredNameForTheConferenceOrEvent ?name . }
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?date . }
  OPTIONAL { ?event gnd:geographicAreaCode ?areaCode . }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?place . }
  OPTIONAL { ?event gnd:topic ?topic . }
  OPTIONAL { ?event gnd:homepage ?homepage . }
}
#LIMIT 10000
Script to setup a Jena instance with GND data extract
gnd2jena
#!/bin/bash
# WF 2020-05-10
# global settings

# Jena release; the unpacked directory and the tarball are named after it
jena="apache-jena-4.4.0"
tgz="${jena}.tar.gz"
# download location of the tarball
# alternative: mirror=https://downloads.apache.org/jena/binaries
mirror="https://archive.apache.org/dist/jena/binaries"
jenaurl="${mirror}/${tgz}"
# base directory of the GND installation
# alternatives: base=/hd/torterra/gnd
#               base=/hd/luxio/gnd
base="/hd/seel/gnd"
# database location passed to the loader
data="${base}/data"
# TDB2 bulk loader shipped with the Jena distribution
tdbloader="${jena}/bin/tdb2.tdbloader"
#
# make sure Jena is available and the data directory exists
#
# reads the global settings tgz, jenaurl, jena and data
# returns 0 on success and 1 as soon as download, unpacking or
# directory creation fails
#
getjena() {
  # download
  if [ ! -f "$tgz" ]
  then
    echo "downloading $tgz from $jenaurl"
    # drop a partial download so that the next run does not
    # report the broken tarball as "already downloaded"
    wget "$jenaurl" || { rm -f "$tgz"; return 1; }
  else
    echo "$tgz already downloaded"
  fi
  # unpack
  if [ ! -d "$jena" ]
  then
    echo "unpacking $jena from $tgz"
    tar xvzf "$tgz" || return 1
  else
    echo "$jena already unpacked"
  fi
  # create data directory
  if [ ! -d "$data" ]
  then
    echo "creating $data directory"
    mkdir -p "$data" || return 1
  else
    echo "$data directory already created"
  fi
}
#
# print the given message followed by the current UTC time
# (ISO 8601 format, e.g. 2020-07-19T07:08:15Z)
#   $1: the message to show
#
timestamp() {
  local text="$1"
  local now
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  echo "$text at $now"
}
#
# load data for the given data dir and input
#   $1: TDB2 database location (passed to the loader as --loc)
#   $2: input file to load
# stdout and stderr of the loader go to tdb2-<tag>-out.log and
# tdb2-<tag>-err.log in the current directory
#
loaddata() {
  local data="$1"
  local input="$2"
  # tag for the log file names: a $phase set by the caller is honored,
  # otherwise the tag is the input file name without its .ttl suffix,
  # so that consecutive loads no longer overwrite each other's logs
  local logtag="${phase:-$(basename "$input" .ttl)}"
  timestamp "start loading $input to $data"
  # $tdbloader stays unquoted so that the setting may carry extra arguments
  $tdbloader --loc "$data" "$input" > "tdb2-${logtag}-out.log" 2> "tdb2-${logtag}-err.log"
  timestamp "finished loading $input to $data"
}
# main: prepare Jena, then download and load each GND extract
getjena
# TMPDIR is exported for the child processes
# NOTE(review): presumably used by the loader for temporary files - confirm
export TMPDIR="$base/tmp"
if [ ! -d "$TMPDIR" ]
then
  echo "creating temporary directory $TMPDIR"
  mkdir -p "$TMPDIR"
else
  echo "using temporary directory $TMPDIR"
fi
for d in kongress geografikum
do
  file="authorities-${d}_lds.ttl"
  if [ ! -f "$file" ]
  then
    # fetch and unpack the extract; without the file there is nothing
    # to load, so a failed download skips this extract
    if ! { wget "https://data.dnb.de/opendata/$file.gz" && gunzip "$file.gz"; }
    then
      echo "could not download or unpack $file.gz - skipping $file" 1>&2
      continue
    fi
  else
    echo "$file already downloaded"
  fi
  loaddata "$data" "$file"
done
try on confident23 server
2020-07-19
wf@confident23:/usr/local/src$ sudo ./getjena
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:08:15Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:11:16Z
2020-09-15
./getjena
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
--2020-09-15 11:46:37-- https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80381713 (77M) [application/x-gzip]
Saving to: ‘authorities-kongress_lds.ttl.gz’
authorities-kongres 100%[===================>] 76.66M 66.3MB/s in 1.2s
2020-09-15 11:46:38 (66.3 MB/s) - ‘authorities-kongress_lds.ttl.gz’ saved [80381713/80381713]
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:46:42Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:25Z
--2020-09-15 11:50:25-- https://data.dnb.de/opendata/authorities-geografikum_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33842092 (32M) [application/x-gzip]
Saving to: ‘authorities-geografikum_lds.ttl.gz’
authorities-geograf 100%[===================>] 32.27M 53.4MB/s in 0.6s
2020-09-15 11:50:26 (53.4 MB/s) - ‘authorities-geografikum_lds.ttl.gz’ saved [33842092/33842092]
start loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:27Z
finished loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:52:40Z
start fuseki
#!/bin/bash
# WF 2020-06-25
# Jena Fuseki server installation
# see https://jena.apache.org/documentation/fuseki2/fuseki-run.html
# downloads and unpacks Fuseki if necessary and then starts the server
# with the TDB2 database at $gnddata published as dataset /gnd
# NOTE(review): the gnd2jena script on this page pins apache-jena-4.4.0
# while this script pins Fuseki 3.16.0 - confirm the versions fit together
version=3.16.0
fuseki="apache-jena-fuseki-$version"
if [ ! -d "$fuseki" ]
then
  if [ ! -f "$fuseki.tar.gz" ]
  then
    # https as in the gnd2jena download of the Jena distribution
    wget "https://archive.apache.org/dist/jena/binaries/$fuseki.tar.gz" || exit 1
  else
    echo "$fuseki.tar.gz already downloaded"
  fi
  echo "unpacking $fuseki.tar.gz"
  tar xvfz "$fuseki.tar.gz" || exit 1
else
  echo "$fuseki already downloaded and unpacked"
fi
# do not try to start the server from the wrong directory
cd "$fuseki" || exit 1
gnddata=/var/data/gnd/data
java -jar fuseki-server.jar --tdb2 --loc="$gnddata" /gnd