Not one of my own: a colleague came up with the following to speed up my slow while loop. It takes advantage of GNU parallel by turning the path extraction and the actual copy into a function, which can then be farmed out to multiple processes that all read from the same input file.
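The core pattern is small enough to show on its own: define a function, export it with "export -f" so the shells that parallel spawns inherit it, and let parallel feed it one line of input at a time. A stripped-down sketch, with illustrative names that are not part of the script below:

do_one() {
    echo "Processing $1"
}
export -f do_one
parallel -j 4 do_one {} < inputs.txt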

#!/bin/bash
#
# Description : This script takes a single file that lists loadable sources on s3 and exports
#               the compiled text files to the directory in which it is invoked, preserving
#               the scope/user_id/imei directory structure.
#               The copy is parallelised with the GNU command "parallel". The number of
#               parallel jobs used for the copy is 16 by default and can be modified
#               with the -j option.
#               See https://www.gnu.org/software/parallel/ for more information.
#               Note that parallel can be installed in a conda environment with:
#               conda install parallel
#
set -e
set -o pipefail
 
usage()
{
    echo "usage: aws-export.sh [ -f LOADABLE_SOURCE_FILE ] [ -o OUTPUT_DIR ] [ -j JOBS ]"
}
 
while getopts 'f:o:j:h' option
do
    case $option in
        f)
            LOADABLE_SRC=${OPTARG};;
        o)
            OUTPUT_DIR=${OPTARG};;
        j)
            JOBS=${OPTARG};;
        h|\?)
            usage
            exit 0;;
    esac
done
 
echo "########################################"
echo "Exporting compiled text files from AWS"
echo "########################################"
echo "Date : $(date)"
echo "Host : $(hostname)"
echo "########################################"
echo ""
echo ""
 
if [ -z "$LOADABLE_SRC" ]; then
    echo "You have not specified a file that lists compiled text with the '-f' flag."
    echo "Please enter the path to such a file : "
    read -r LOADABLE_SRC
fi
 
if [ -z "$OUTPUT_DIR" ]; then
    OUTPUT_DIR='.'
    echo "########################################"
    echo "Exporting files to current directory"
    echo "########################################"
    echo ""
fi
 
if [ -z "$JOBS" ]; then
    JOBS=16
fi
echo "*** Parallel execution using $JOBS CPU threads ***"
echo ""
 
aws_copy()
{
    echo "Copying file $1"
    SUB_DIR=$(echo "$1" | awk '{split($0, path, "/"); print path[4]"/"path[5]"/"path[6]"/"}')
    echo "Target is : $2/$SUB_DIR"
    aws s3 cp "$1" "$2/$SUB_DIR" 2>/dev/null
    echo "########################################"
}
 
# Set timer to zero
SECONDS=0
 
export -f aws_copy
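# Each line of LOADABLE_SRC is substituted for {}; parallel keeps up
# to $JOBS jobs running at once.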
parallel -j "$JOBS" aws_copy {} "$OUTPUT_DIR" < "$LOADABLE_SRC"
 
DURATION=$SECONDS
echo ""
echo ""
echo "Export took       : $DURATION seconds"
echo "Files exported to : $OUTPUT_DIR"
echo "########################################"