Pdf2wordcloud
Jump to navigation
Jump to search
Example usage
For example, download all your Overleaf PDF files, then pass the download directory and a time constraint (e.g. 4 hours) so that only the PDF files modified within that time frame are selected:
pdf2wordcloud $HOME/Downloads 4 h
Example result
Script
#!/bin/bash
# Author: Wolfgang Fahl
# Created with assistance from OpenAI's ChatGPT
# Date: 2024-06-25
# Description: Generates a word cloud from PDF files in a specified directory that were modified within a given time frame.
# Usage: ./pdf2wordcloud <directory> <time value> <time unit>
# Example: ./pdf2wordcloud downloads 2 h
# ANSI escape sequences for colored terminal output.
# They are stored as literal backslash strings; the `echo -e` in color_msg
# is what turns them into real escape characters.
red="\033[0;31m"
green="\033[0;32m"
endColor="\033[0m"
# Print a message wrapped in a color escape sequence.
# Globals:   endColor (read) - sequence appended after the text
# Arguments: $1 - escape sequence emitted before the text
#            $2 - text to print
# Outputs:   the colored text on stdout; `echo -e` expands backslash
#            escapes in the whole string
color_msg() {
  local prefix="$1" text="$2"
  echo -e "${prefix}${text}${endColor}"
}
# Report a fatal error and terminate the script.
# Globals:   red (read)
# Arguments: $1 - description of what went wrong
# Outputs:   a red "Error:" header followed by the tab-indented message,
#            both on stderr so they never mix with regular output
# Exits with status 1; never returns to the caller.
error() {
  local l_msg="$1"
  {
    color_msg "$red" "Error:"
    color_msg "$red" "\t$l_msg"
  } >&2
  exit 1
}
# Print a success notice in green.
# Globals:   green (read)
# Arguments: $1 - message text
# Outputs:   the message on stdout (via color_msg)
success() {
  local l_msg="$1"
  # Quoted so the color is always passed as exactly one argument
  color_msg "$green" "$l_msg"
}
# Ensure a command-line program is available, installing it when missing.
# Globals:   red (read)
# Arguments: $1 - program name to look up
#            $2 - apt package installed on Linux
#            $3 - MacPorts port installed on Darwin (macOS)
# Returns:   0 when the program is already present, otherwise the status of
#            the package manager call; on any other operating system error
#            is called, which exits the script with status 1.
autoinstall() {
  local l_prog="$1"
  local l_linuxpackage="$2"
  local l_macospackage="$3"
  local os

  # command -v is a shell builtin, so the lookup itself needs no external tool
  if command -v "$l_prog" >/dev/null 2>&1; then
    return 0
  fi

  # Declared separately above so `local` does not mask uname's exit status
  os=$(uname)
  case "$os" in
    Darwin)
      color_msg "$red" "Installing $l_prog from MacPorts package $l_macospackage"
      sudo port install "$l_macospackage"
      ;;
    Linux)
      color_msg "$red" "Installing $l_prog from apt-package $l_linuxpackage"
      sudo apt-get install -y "$l_linuxpackage"
      ;;
    *)
      error "Unsupported operating system $os"
      ;;
  esac
}
# Ensure a Python package is installed, installing it via pip when missing.
# Globals:   red (read)
# Arguments: $1 - package name as known to pip
# Returns:   0 when `pip show` already finds the package, otherwise the
#            status of `pip install`
pip_autoinstall() {
  local l_package="$1"
  # `pip show` is used purely as a presence check; its output is discarded
  if ! pip show "$l_package" >/dev/null 2>&1; then
    color_msg "$red" "Installing Python package $l_package"
    pip install "$l_package"
  fi
}
# Make sure the prerequisites are present before doing any work:
# the pdftotext program (apt: poppler-utils, MacPorts: poppler) and the
# Python package wordcloud.
# NOTE(review): this runs ahead of argument parsing, so even -h/--help can
# trigger an installation.
autoinstall "pdftotext" "poppler-utils" "poppler"
pip_autoinstall "wordcloud"
# Display usage information and terminate.
# Globals:   red, green (read)
# Outputs:   a short synopsis on stdout (via color_msg)
# Exits with status 1, also when help was requested explicitly.
usage() {
  color_msg "$red" "Usage:"
  color_msg "$green" "\t$0 <directory> <time value> <time unit>"
  color_msg "$green" "\t-h, --help: Show help."
  exit 1
}
# Parse command line options.
# Usage is shown (and the script exits) when -h/--help is the first argument
# or when the argument count is not exactly three.
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" || $# -ne 3 ]]; then
  usage
fi

DIRECTORY="$1"   # directory that find searches for PDF files
TIME_VALUE="$2"  # size of the look-back window
TIME_UNIT="$3"   # unit of TIME_VALUE: m (minutes) or h (hours)

# Fail early with a precise message; otherwise a wrong path would only
# surface later as a find error followed by "No PDF files found".
if [[ ! -d "$DIRECTORY" ]]; then
  error "Directory '$DIRECTORY' does not exist."
fi
# Convert a time span to whole minutes.
# Arguments: $1 - amount, a non-negative decimal integer
#            $2 - unit: m (minutes) or h (hours)
# Outputs:   the span in minutes on stdout
# Returns:   0 on success, 2 for an unknown unit, 1 for an invalid amount
time_to_minutes() {
  local l_value="$1"
  local l_unit="$2"
  local l_factor
  case "$l_unit" in
    m) l_factor=1 ;;
    h) l_factor=60 ;;
    *) return 2 ;;
  esac
  # Only plain digits may reach the arithmetic expansion: anything else could
  # be evaluated as a shell arithmetic expression.
  [[ "$l_value" =~ ^[0-9]+$ ]] || return 1
  # 10# forces base 10 so values with leading zeros (e.g. 08) are not
  # misread as octal.
  echo "$(( 10#$l_value * l_factor ))"
}

# Validate the time arguments and build the argument for find's -mmin test
# ("-N" selects files modified less than N minutes ago).
# TIME_VALUE is deliberately left untouched so later messages can show the
# amount in the unit the user supplied.
MOD_TIME=""
WINDOW_MINUTES=$(time_to_minutes "$TIME_VALUE" "$TIME_UNIT")
case $? in
  0)
    MOD_TIME="-${WINDOW_MINUTES}"
    ;;
  1)
    error "Invalid time value '$TIME_VALUE'. Use a non-negative integer."
    ;;
  *)
    error "Invalid time unit. Use 'm' for minutes or 'h' for hours."
    ;;
esac
# Extract the text of the selected PDF files and render it as a word cloud.
# Reads DIRECTORY, MOD_TIME, TIME_VALUE and TIME_UNIT set earlier in the script.

# Temporary file collecting the extracted text. It has to be created before
# find appends to it; the EXIT trap removes it on every exit path, including
# the exit performed by error.
TEMP_TEXT=$(mktemp) || error "Could not create a temporary file."
trap 'rm -f -- "$TEMP_TEXT"' EXIT

# Find the PDF files modified within the time window and append their text.
# The "-" output argument makes pdftotext write to stdout.
find "$DIRECTORY" -type f -name '*.pdf' -mmin "$MOD_TIME" \
  -exec pdftotext {} - \; >>"$TEMP_TEXT"

# Check if any text was extracted
if [[ ! -s "$TEMP_TEXT" ]]; then
  error "No PDF files found that were modified within the last $TIME_VALUE $TIME_UNIT."
fi

# Generate the word cloud; stop with an error instead of reporting success
# when the generator fails.
wordcloud_cli --text "$TEMP_TEXT" --imagefile "${DIRECTORY}/wordcloud.png" \
  || error "wordcloud_cli failed to generate ${DIRECTORY}/wordcloud.png."

# Success message (the temporary file is removed by the EXIT trap)
success "Word cloud generated successfully and saved to ${DIRECTORY}/wordcloud.png."