Not one of my own; rather, a colleague came up with the following to improve the speed of my slow while
loop. It takes advantage of the GNU parallel
command by turning the path extraction and the actual copy into a single function, which can then be farmed out to multiple processes that all take input from the same file.
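The core mechanism can be sketched in isolation. The function name, input file and job count below are hypothetical, just to show the moving parts: parallel runs its commands through a shell, so a bash function exported with export -f is visible to the worker shells it spawns, and each line of the input becomes the {} argument of one job.

process_line() {
    # Stand-in for the real work; $1 is one line of input, $2 a fixed argument.
    echo "processing $1 into $2"
}

# Export the function so the shells spawned by parallel can call it,
# then feed it one line of input.txt per job, 4 jobs at a time.
export -f process_line
parallel -j 4 process_line {} /some/output/dir < input.txt

Here is the colleague's full script: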
#!/bin/bash
#
# Description : This script takes a single file that lists loadable sources on s3 and exports
#               the compiled text files to the directory in which it is invoked, preserving
#               the scope/user_id/imei directory structure.
#               The copy is parallelised with the GNU command "parallel". The number of
#               parallel jobs used for the copy is 16 by default and can be modified with
#               the -j option.
#               See https://www.gnu.org/software/parallel/ for more information.
#               Note that parallel can be installed in a conda environment with:
#                   conda install parallel
#
set -e
set -o pipefail

usage() {
    echo "usage: aws-export.sh [ -f LOADABLE_SOURCE_FILE ] [ -o OUTPUT_DIR ] [ -j JOBS ]"
}

while getopts 'f:o:j:h' option
do
    case $option in
        f) LOADABLE_SRC=${OPTARG};;
        o) OUTPUT_DIR=${OPTARG};;
        j) JOBS=${OPTARG};;
        h|?) usage
             exit 0;;
    esac
done

echo "########################################"
echo "Exporting compiled text files from AWS"
echo "########################################"
echo "Date : $(date)"
echo "Host : $(hostname)"
echo "########################################"
echo ""
echo ""

if [ -z "$LOADABLE_SRC" ]; then
    echo "You have not specified a file that lists compiled text with the '-f' flag."
    echo "Please enter the path to such a file : "
    read -r LOADABLE_SRC
fi

if [ -z "$OUTPUT_DIR" ]; then
    OUTPUT_DIR='.'
    echo "########################################"
    echo "Exporting files to current directory"
    echo "########################################"
    echo ""
fi

if [ -z "$JOBS" ]; then
    JOBS=16
fi

echo "*** Parallel execution using $JOBS CPU threads ***"
echo ""

# Pull the scope/user_id/imei components out of the s3 URL ($1) and copy
# the file under that subtree of the output directory ($2).
aws_copy() {
    echo "Copying file $1"
    SUB_DIR=$(echo "$1" | awk '{split($0, path, "/"); print path[4]"/"path[5]"/"path[6]"/"}')
    echo "Target is : $2/$SUB_DIR"
    aws s3 cp "$1" "$2/$SUB_DIR" 2>/dev/null
    echo "########################################"
}

# Set timer to zero
SECONDS=0

# Export the function so the shells spawned by parallel can call it.
export -f aws_copy
parallel -j "$JOBS" aws_copy {} "$OUTPUT_DIR" < "$LOADABLE_SRC"

DURATION=$SECONDS

echo ""
echo ""
echo "Export took : $DURATION seconds"
echo "Files exported to : $OUTPUT_DIR"
echo "########################################"
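Invocation then looks like this (the file and directory names here are made up for illustration):

./aws-export.sh -f loadable_sources.txt -o /data/export -j 32

The -j value is handed straight to parallel, so it sets how many aws s3 cp processes run at once. Note that the awk split assumes each line of the listing file is a full s3://bucket/scope/user_id/imei/... URL; that is where the preserved directory structure comes from.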