#!/bin/bash
#
# Initialisation and basic checks
#

# Change directory to working directory (Loghi scripts)
cd "$(dirname "$0")"

MIBUDERA_CLIENT_DIRECTORY="../mibudera_client"
MIBUDERA_CLIENT_PAGES_DIRECTORY="$MIBUDERA_CLIENT_DIRECTORY/page"

# Check if jq is installed
if ! command -v jq &> /dev/null
then
  echo "ERROR: jq could not be found. Please install it."
  exit 1
fi

# Check if curl is installed
if ! command -v curl &> /dev/null
then
  echo "ERROR: curl could not be found. Please install it."
  exit 1
fi

# Source the mibudera_client.env file
if [ -f "mibudera_client.env" ]; then
  source mibudera_client.env
else
  echo "ERROR: The file mibudera_client.env was not found. See https://mibudera.coret.org/howto.php for instructions."
  exit 1
fi

# Now the variables are available
if [ -z "$MIBUDERA_CLIENT_IDENTIFIER" ] || [ -z "$MIBUDERA_CLIENT_KEY" ]; then
  echo "ERROR: One or both variables (MIBUDERA_CLIENT_IDENTIFIER/MIBUDERA_CLIENT_KEY) are not set in mibudera_client.env. See https://mibudera.coret.org/howto.php for instructions."
  exit 1
fi

# Number of images to fetch per batch (defaults to 25) - can be set as a parameter in mibudera_client.env!
if [ -z "$MIBUDERA_BATCH_SIZE" ]; then
  MIBUDERA_BATCH_SIZE=25
fi

# Make work directory for images and page XML files
mkdir -p "$MIBUDERA_CLIENT_DIRECTORY"
mkdir -p "$MIBUDERA_CLIENT_PAGES_DIRECTORY"

# Cleanup files from previous runs
rm "$MIBUDERA_CLIENT_DIRECTORY"/* 2>/dev/null
rm "$MIBUDERA_CLIENT_PAGES_DIRECTORY"/* 2>/dev/null

# Get some config info from the inference-pipeline.sh script
LAYPABASELINEMODELWEIGHTS=$(grep '^LAYPABASELINEMODELWEIGHTS=' inference-pipeline.sh | awk -F'=' '{print $2}' | awk -F'/' '{print $NF}')
HTRLOGHIMODEL=$(grep '^HTRLOGHIMODEL=' inference-pipeline.sh | awk -F'=' '{print $2}')
HTRLOGHIMODEL=${HTRLOGHIMODEL%/}   # Remove trailing slash if present
HTRLOGHIMODEL=${HTRLOGHIMODEL##*/} # Extract the last directory name

#
# Fetch and Store functions
#

function mibudera_tracker_fetch {
  for ((run = 1; run <= MIBUDERA_BATCH_SIZE; run++)); do
    JSON_FILENAME=$MIBUDERA_CLIENT_DIRECTORY/$(date +%Y%m%d%H%M%S).json
    curl -s -X 'GET' \
      'https://mibudera.coret.org/api/v0/page' \
      -H "Accept: application/json" \
      -H "X-MIBUDERA-CLIENT-IDENTIFIER: $MIBUDERA_CLIENT_IDENTIFIER" \
      -H "X-MIBUDERA-CLIENT-KEY: $MIBUDERA_CLIENT_KEY" \
      -o "$JSON_FILENAME"

    # Extract image_url and guid using jq
    image_url=$(jq -r '.image_url' "$JSON_FILENAME")
    guid=$(jq -r '.guid' "$JSON_FILENAME")

    # Download the image using curl
    if [[ -n "$image_url" && -n "$guid" ]]; then
      curl -s -o "$MIBUDERA_CLIENT_DIRECTORY/$guid.jpg" "$image_url"
      curl_status=$? # capture curl's exit status before sleep overwrites $?
      sleep 1        # be nice to the archive's infrastructure
      if [ $curl_status -eq 0 ]; then
        echo "INFO: image $image_url downloaded successfully as $MIBUDERA_CLIENT_DIRECTORY/$guid.jpg"
      else
        echo "ERROR: Could not download image $image_url"
      fi
    else
      echo "ERROR: image_url or guid is empty."
    fi

    rm "$JSON_FILENAME"
  done
}

function mibudera_tracker_store {
  # Check if the directory exists
  if [ -d "$MIBUDERA_CLIENT_PAGES_DIRECTORY" ]; then
    # Iterate over all XML files in the directory
    for xml_file in "$MIBUDERA_CLIENT_PAGES_DIRECTORY"/*.xml; do
      if [ "$xml_file" != "$MIBUDERA_CLIENT_PAGES_DIRECTORY/*.xml" ]; then
        # Check if the file exists (important for handling cases where no .xml files exist)
        if [ -f "$xml_file" ]; then
          # Extract filename without path and extension
          guid=$(basename "$xml_file" .xml)

          # Get page xml into variable
          xml_content=$(< "$xml_file")

          # Escape special characters for JSON
          escaped_xml=$(echo "$xml_content" | jq -Rs .)
          # Store JSON to POST (htr_model = Loghi HTR model, laypa_model = Laypa baseline weights)
          echo "{\"guid\": \"$guid\",\"htr_model\":\"$HTRLOGHIMODEL\",\"laypa_model\":\"$LAYPABASELINEMODELWEIGHTS\",\"pagexml\": $escaped_xml}" > "$MIBUDERA_CLIENT_PAGES_DIRECTORY/$guid.json"

          # POST transcription
          response=$(curl -s -X 'POST' 'https://mibudera.coret.org/api/v0/transcription' \
            -H "X-MIBUDERA-CLIENT-IDENTIFIER: $MIBUDERA_CLIENT_IDENTIFIER" \
            -H "X-MIBUDERA-CLIENT-KEY: $MIBUDERA_CLIENT_KEY" \
            -H 'Content-Type: application/json' --data-binary @"$MIBUDERA_CLIENT_PAGES_DIRECTORY/$guid.json")
          echo "INFO: transcription $guid successfully stored at the Mibudera Tracker"

          # Cleanup files
          rm "$MIBUDERA_CLIENT_DIRECTORY/$guid".*
          rm "$MIBUDERA_CLIENT_PAGES_DIRECTORY/$guid".*
        else
          echo "ERROR: File '$xml_file' not found."
          exit 1
        fi
      else
        echo "ERROR: No transcriptions found in $MIBUDERA_CLIENT_PAGES_DIRECTORY"
      fi
    done
  else
    echo "ERROR: Directory $MIBUDERA_CLIENT_PAGES_DIRECTORY not found."
    exit 1
  fi
}

#
# Main Mibudera Client
#
while :
do
  echo ' _____ ._____. .___ _________ .__ .__ __ '
  echo ' / \ |__\_ |__ __ __ __| _/________________ \_ ___ \| | |__| ____ _____/ |_ '
  echo '/ \ / \| || __ \| | \/ __ |/ __ \_ __ \__ \ / \ \/| | | |/ __ \ / \ __\'
  echo '/ Y \ || \_\ \ | / /_/ \ ___/| | \// __ \_ \ \___| |_| \ ___/| | \ | '
  echo '\____|__ /__||___ /____/\____ |\___ >__| (____ / \______ /____/__|\___ >___| /__| '
  echo ' \/ \/ \/ \/ \/ \/ \/ \/ v 0.6'
  echo ''
  echo 'Track the progress of your Mibudera Client (ID: '$MIBUDERA_CLIENT_IDENTIFIER')'
  echo 'via the public leaderboard at https://mibudera.coret.org/leaderboard'
  echo ''
  echo 'Press [CTRL+C] to stop the Mibudera Client'
  echo ''
  echo '(1) Fetching '$MIBUDERA_BATCH_SIZE' image URLs from the Mibudera Tracker and downloading the images from the source'
  echo ''

  mibudera_tracker_fetch

  echo ''
  echo '(2) Making the transcriptions with Loghi (inference-pipeline.sh '$MIBUDERA_CLIENT_DIRECTORY')'
  echo ''

  ./inference-pipeline.sh "$MIBUDERA_CLIENT_DIRECTORY"
  pipeline_status=$? # capture the pipeline's exit code before the test below overwrites $?
  if [ $pipeline_status -ne 0 ]; then
    echo "ERROR: inference-pipeline.sh failed with exit code $pipeline_status - stopping the Mibudera Client"
    exit 1
  fi

  echo ''
  echo '(3) Storing the transcriptions to the Mibudera Tracker'
  echo ''

  mibudera_tracker_store

  echo ''

  if [ "$1" = "onerun" ]; then
    # To test the Mibudera Client with only one batch, start it as: ./mibudera_client.sh onerun
    exit 0 # Exit with success code (0)
  fi
done