Difference between revisions of "Pdf2wordcloud"

From BITPlan Wiki
Jump to navigation Jump to search
 
(One intermediate revision by the same user not shown)
Line 5: Line 5:
 
pdf2wordcloud $HOME/Downloads 4 h
 
pdf2wordcloud $HOME/Downloads 4 h
 
</source>
 
</source>
 +
== Example result ==
 +
[[File:wordcloud.png|600px]]
 +
 
= Script =
 
= Script =
 
<source lang='bash'>
 
<source lang='bash'>

Latest revision as of 10:28, 25 June 2024

Example usage

For example, download all your Overleaf PDF files into one directory. Then pass that directory together with a time constraint (e.g. 4 hours) to select which files are processed.

pdf2wordcloud $HOME/Downloads 4 h

Example result

Wordcloud.png

Script

#!/bin/bash
# Author: Wolfgang Fahl
# Created with assistance from OpenAI's ChatGPT
# Date: 2024-06-25
# Description: Generates a word cloud from PDF files in a specified directory that were modified within a given time frame.
# Usage: ./pdf2wordcloud <directory> <time value> <time unit>   (time unit: m = minutes, h = hours)
# Example: ./pdf2wordcloud downloads 2 h

# ANSI escape sequences for colorized terminal output.
# The values are kept as literal backslash sequences; they only become real
# escape characters when printed with backslash interpretation enabled.
endColor="\033[0m"
green="\033[0;32m"
red="\033[0;31m"

# Print a message wrapped in a color sequence, followed by a color reset.
# Arguments:
#   $1 - color escape sequence to emit before the message
#   $2 - message text; backslash escapes such as \t are interpreted
# Globals:
#   endColor - reset sequence appended after the message (read)
# Outputs:
#   the colored message plus a trailing newline on stdout
color_msg() {
  local l_paint="$1" l_text="$2"
  # %b expands backslash escapes in the argument the same way `echo -e` does.
  printf '%b\n' "${l_paint}${l_text}${endColor}"
}

# Print an error message in red and terminate the script.
# Arguments:
#   $1 - error message text
# Globals:
#   red - color sequence passed to color_msg (read)
# Outputs:
#   an "Error:" headline and the tab-indented message, both on stderr so
#   that diagnostics never mix with regular output
# Exits:
#   always, with status 1
error() {
  local l_msg="$1"
  {
    # "$red" is quoted: the value contains '\' and '[' and must reach
    # color_msg as exactly one unmodified argument.
    color_msg "$red" "Error:"
    color_msg "$red" "\t$l_msg"
  } >&2
  exit 1
}

# Print a success message in green.
# Arguments:
#   $1 - message text
# Globals:
#   green - color sequence passed to color_msg (read)
# Outputs:
#   whatever color_msg prints for the green message
success() {
  local l_msg="$1"
  # Quoted so the color sequence is passed as a single argument regardless
  # of the current IFS or glob settings.
  color_msg "$green" "$l_msg"
}

# Install a required program if it is not available yet.
# Arguments:
#   $1 - name of the program to look for
#   $2 - apt package that provides it on Linux
#   $3 - MacPorts package that provides it on macOS
# Globals:
#   red - color sequence used for the install notice (read)
# Outputs:
#   an "Installing ..." notice via color_msg plus the installer's own output
# Returns:
#   0 when the program is already available, otherwise the status of the
#   install command; calls error (which exits) on an unsupported OS
autoinstall() {
  local l_prog="$1"
  local l_linuxpackage="$2"
  local l_macospackage="$3"
  local os

  # `command -v` is a shell builtin, so the check does not depend on the
  # external `which` program being installed.
  if command -v "$l_prog" >/dev/null 2>&1; then
    return 0
  fi

  # Only query the operating system when an install is actually needed.
  os=$(uname)
  case "$os" in
    Darwin)
      color_msg "$red" "Installing $l_prog from MacPorts package $l_macospackage"
      sudo port install "$l_macospackage"
      ;;
    Linux)
      color_msg "$red" "Installing $l_prog from apt-package $l_linuxpackage"
      sudo apt-get install -y "$l_linuxpackage"
      ;;
    *)
      error "Unsupported operating system $os"
      ;;
  esac
}

# Install a Python package via pip if `pip show` does not report it.
# Arguments:
#   $1 - package name as understood by pip
# Globals:
#   red - color sequence used for the install notice (read)
# Outputs:
#   an "Installing ..." notice via color_msg plus pip's own install output
# Returns:
#   0 when the package is already present, otherwise the status of
#   `pip install`
pip_autoinstall() {
  local l_package="$1"
  # The package name is quoted everywhere so that characters such as '['
  # (e.g. in an extras specification) are never glob-expanded against files
  # in the current directory.
  if ! pip show "$l_package" >/dev/null 2>&1; then
    color_msg "$red" "Installing Python package $l_package"
    pip install "$l_package"
  fi
}

# Check and install necessary tools.
# NOTE: these calls are executed before the command line is parsed, so a
# missing dependency triggers an install attempt even when the script is only
# asked for help or is called with the wrong number of arguments.
# pdftotext comes from the apt package poppler-utils or the MacPorts package
# poppler.
# NOTE(review): the pip package wordcloud presumably provides the
# wordcloud_cli command used at the end of the script — confirm.
autoinstall pdftotext poppler-utils poppler
pip_autoinstall wordcloud

# Display usage information and terminate the script.
# Globals:
#   red, green - color sequences passed to color_msg (read)
# Outputs:
#   a "Usage:" headline, the call synopsis and the help option, each printed
#   through color_msg
# Exits:
#   always, with status 1
usage() {
  # The color variables are quoted so each one reaches color_msg as a single
  # argument regardless of the current IFS or glob settings.
  color_msg "$red" "Usage:"
  color_msg "$green" "\t$0 <directory> <time value> <time unit>"
  color_msg "$green" "\t-h, --help: Show help."
  exit 1
}

# Parse command line options.
# usage prints the help text and exits, so it is called when help is
# requested explicitly or when the argument count is not exactly three
# (which also covers a call without any arguments).
case "${1-}" in
  -h|--help) usage ;;
esac
[ "$#" -eq 3 ] || usage

# Positional arguments: directory to scan, time value and time unit.
DIRECTORY="$1"
TIME_VALUE="$2"
TIME_UNIT="$3"

# Convert a time value plus unit into whole minutes.
# Arguments:
#   $1 - time value; must consist of decimal digits only
#   $2 - time unit: 'm' (minutes) or 'h' (hours)
# Outputs:
#   the number of minutes on stdout
# Returns:
#   0 on success, 1 when the value is not a non-negative integer,
#   2 when the unit is unknown
time_to_minutes() {
  local l_value="$1"
  local l_unit="$2"
  # Reject anything but digits before it reaches arithmetic expansion, which
  # would otherwise evaluate arbitrary expressions supplied by the user.
  [[ "$l_value" =~ ^[0-9]+$ ]] || return 1
  # 10# forces base 10 so values with leading zeros (e.g. 08) are not
  # rejected as invalid octal numbers.
  case "$l_unit" in
    m) echo "$((10#$l_value))" ;;
    h) echo "$((10#$l_value * 60))" ;;
    *) return 2 ;;
  esac
}

# Validate the time arguments and build the -mmin argument for find
# (a leading '-' means "modified less than N minutes ago").
# TIME_VALUE itself is left untouched so that later messages report the value
# in the unit the user entered.
MOD_TIME=""
case "$TIME_UNIT" in
  m|h)
    ;;
  *)
    error "Invalid time unit. Use 'm' for minutes or 'h' for hours."
    ;;
esac
if ! MOD_TIME="-$(time_to_minutes "$TIME_VALUE" "$TIME_UNIT")"; then
  error "Invalid time value '$TIME_VALUE'. Use a non-negative whole number."
fi

# Temporary file collecting the text extracted from all matching PDF files.
# It has to be created before anything is redirected into it.
TEMP_TEXT=$(mktemp) || error "Could not create a temporary file."

# Remove the temporary file on every exit path, including the error below.
trap 'rm -f -- "$TEMP_TEXT"' EXIT

# Find the PDF files below DIRECTORY that match the -mmin constraint in
# MOD_TIME and append the text of each one to the temporary file
# (the '-' output argument makes pdftotext write to stdout).
find "$DIRECTORY" -type f -name '*.pdf' -mmin "$MOD_TIME" -exec pdftotext {} - \; >> "$TEMP_TEXT"

# Check if any text was extracted
if [ ! -s "$TEMP_TEXT" ]; then
  error "No PDF files found that were modified within the last $TIME_VALUE $TIME_UNIT."
fi

# Generate the word cloud image inside the scanned directory.
# The exit status is checked so that success is only reported when
# wordcloud_cli actually completed.
if ! wordcloud_cli --text "$TEMP_TEXT" --imagefile "${DIRECTORY}/wordcloud.png"; then
  rm -f -- "$TEMP_TEXT"
  error "wordcloud_cli failed; no word cloud was generated."
fi

# Cleanup and success message
rm -f -- "$TEMP_TEXT"
success "Word cloud generated successfully and saved to ${DIRECTORY}/wordcloud.png."