Difference between revisions of "WikiData Import 2020-08-15"

From BITPlan Wiki
Jump to navigation Jump to search
Line 14: Line 14:
 
= Summary =
 
= Summary =
 
= Download and unpack =
 
= Download and unpack =
 +
Downloading took some 7 h
 +
unpacking took some 12h 30
 
<source lang='bash' highlight='1,14'>
 
<source lang='bash' highlight='1,14'>
 
date;wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2;date
 
date;wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2;date
Line 30: Line 32:
 
date;bzip2 -dk latest-all.nt.bz2;date
 
date;bzip2 -dk latest-all.nt.bz2;date
 
Sun Aug 16 07:02:30 CEST 2020
 
Sun Aug 16 07:02:30 CEST 2020
 +
Sun Aug 16 19:37:34 CEST 2020
 
</source>
 
</source>
 +
 
= Scripts =
 
= Scripts =
 
== check extract speed ==
 
== check extract speed ==

Revision as of 19:52, 16 August 2020

Environment

  1. Mac Pro Mid 2010
  2. 12 core 3.46 GHz
  3. 64 GB RAM
  4. macOS High Sierra 10.13.6
  5. Source Disk: 4 TB 7200 rpm hard disk WD Gold WDC WD4002FYYZ Blackmagic speed rating: 175 MB/s write 175 MB/s read
  6. Target Disk: 4 TB SSD Samsung 860 EVO Blackmagic speed rating: 257 MB/s write 270 MB/s read
  7. java -version
    openjdk version "11.0.5" 2019-10-15
    OpenJDK Runtime Environment AdoptOpenJDK (build 11.0.5+10)
    OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.5+10, mixed mode)
    

Summary

Download and unpack

Downloading took some 7 h; unpacking took some 12 h 30 min.

date;wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2;date
Sat Aug 15 17:47:23 CEST 2020
--2020-08-15 17:47:24--  https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620::861:1:208:80:154:7, 208.80.154.7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620::861:1:208:80:154:7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 119725690213 (112G) [application/octet-stream]
Saving to: ‘latest-all.nt.bz2’
latest-all.nt.bz2           0%[                                    ]  34.15M  4.61MB/s    eta 6h 58m 
...
latest-all.nt.bz2         100%[===================================>] 111.50G  4.88MB/s    in  6h 49m  
2020-08-16 00:37:06 (4.64 MB/s) - ‘latest-all.nt.bz2’ saved [119725690213/119725690213]
Sun Aug 16 00:37:06 CEST 2020
date;bzip2 -dk latest-all.nt.bz2;date
Sun Aug 16 07:02:30 CEST 2020
Sun Aug 16 19:37:34 CEST 2020

Scripts

check extract speed

./speed 
149 MB in 3 s =  49 MB/s = 172 GB/h
315/2014 GB (15 %) todo: 1699 GB
ETA 9.9 h

speed

#!/bin/bash
# WF 2020-08-16
# check extract speed

#
# print the size of the given file in MB as reported by du
#
# Arguments: $1 - path of the file to measure
# Outputs:   the size in MB on stdout, a message on stderr on failure
# Returns:   0 on success, 1 if du yields no numeric size
#
_checkSpeedSizeMb() {
  local l_path="$1"
  local l_mb
  l_mb=$(du -sm -- "$l_path" | cut -f1)
  # du prints nothing usable when the file is missing or unreadable
  case "$l_mb" in
    '' | *[!0-9]*)
      echo "checkSpeed: cannot determine size of '$l_path'" >&2
      return 1
      ;;
  esac
  echo "$l_mb"
}

#
# check the speed of extraction
#
# Measures the size of the file being extracted twice, l_secs seconds apart,
# and prints throughput, progress and the estimated time to completion.
#
# Arguments:
#   $1 - sampling interval in seconds (integer >= 1)
#   $2 - expected final size of the extracted file in bytes (>= 1 GiB)
#   $3 - file to watch (optional, default: latest-all.nt)
# Outputs:
#   three lines on stdout: speed, progress and ETA
# Returns:
#   0 on success, 1 if the file size cannot be read, 2 on invalid arguments
#
checkSpeed() {
  local l_secs="$1"
  local l_total="$2"
  local l_file="${3:-latest-all.nt}"
  local l_arg totalgb first second mbytes mpersec gperh donegb todogb percent

  # only plain digit strings are safe to feed into the integer arithmetic below
  for l_arg in "$l_secs" "$l_total"; do
    case "$l_arg" in
      '' | *[!0-9]*)
        echo "checkSpeed: expected a non-negative integer but got '$l_arg'" >&2
        return 2
        ;;
    esac
  done
  # force base 10 so that a leading zero is not interpreted as octal
  l_secs=$((10#$l_secs))
  l_total=$((10#$l_total))
  totalgb=$((l_total / 1024 / 1024 / 1024))
  # both values are used as divisors further down
  if [ "$l_secs" -lt 1 ] || [ "$totalgb" -lt 1 ]; then
    echo "checkSpeed: need at least 1 second and a total of at least 1 GiB" >&2
    return 2
  fi

  first=$(_checkSpeedSizeMb "$l_file") || return 1
  sleep "$l_secs"
  second=$(_checkSpeedSizeMb "$l_file") || return 1

  # integer arithmetic on purpose: every intermediate value is truncated
  mbytes=$((second - first))
  mpersec=$((mbytes / l_secs))
  gperh=$((mpersec * 3600 / 1024))
  echo "$mbytes MB in $l_secs s =  $mpersec MB/s = $gperh GB/h"

  donegb=$((second / 1024))
  todogb=$((totalgb - donegb))
  percent=$((donegb * 100 / totalgb))
  echo "$donegb/$totalgb GB ($percent %) todo: $todogb GB"

  # without measurable progress there is no rate to divide by
  if [ "$gperh" -le 0 ]; then
    echo "ETA unknown (no progress measured)"
    return 0
  fi
  # awk is used because the shell has no floating point division
  awk -v todo="$todogb" -v gperh="$gperh" 'BEGIN { printf("ETA %.1f h\n",todo/gperh) }'
}

# sample over 3 seconds; 2162713035569 is the total size in bytes that
# checkSpeed converts to GB (2014 GB) for the progress and ETA output
checkSpeed 3 2162713035569