#!/bin/sh

# wcurl - a simple wrapper around curl to easily download files.
#
# Requires curl >= 7.46.0 (2015)
#
# Copyright (C) Samuel Henrique <samueloph@debian.org>, Sergio Durigan
# Junior <sergiodj@debian.org> and many contributors, see the AUTHORS
# file.
#
# Permission to use, copy, modify, and distribute this software for any purpose
# with or without fee is hereby granted, provided that the above copyright
# notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN
# NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
#
# Except as contained in this notice, the name of a copyright holder shall not be
# used in advertising or otherwise to promote the sale, use or other dealings in
# this Software without prior written authorization of the copyright holder.
#
# SPDX-License-Identifier: curl

# Stop on errors and on usage of unset variables.
set -eu

VERSION="2025.11.04"

PROGRAM_NAME="$(basename "$0")"
readonly PROGRAM_NAME

# Display the version.
print_version()
{
    cat << _EOF_
${VERSION}
_EOF_
}

# Display the program usage.
usage()
{
    cat << _EOF_
${PROGRAM_NAME} -- a simple wrapper around curl to easily download files.

Usage: ${PROGRAM_NAME} <URL>...
       ${PROGRAM_NAME} [--curl-options <CURL_OPTIONS>]... [--no-decode-filename] [-o|-O|--output <PATH>] [--dry-run] [--] <URL>...
       ${PROGRAM_NAME} [--curl-options=<CURL_OPTIONS>]... [--no-decode-filename] [--output=<PATH>] [--dry-run] [--] <URL>...
       ${PROGRAM_NAME} -h|--help
       ${PROGRAM_NAME} -V|--version

Options:

  --curl-options <CURL_OPTIONS>: Specify extra options to be passed when invoking curl. May be
                                 specified more than once.

  -o, -O, --output <PATH>: Use the provided output path instead of getting it from the URL. If
                           multiple URLs are provided, resulting files share the same name with a
                           number appended to the end (curl >= 7.83.0). If this option is provided
                           multiple times, only the last value is considered.

  --no-decode-filename: Don't percent-decode the output filename, even if the percent-encoding in
                        the URL was done by wcurl, e.g.: the URL contained whitespace.

  --dry-run: Don't actually execute curl, just print what would be invoked.

  -V, --version: Print version information.

  -h, --help: Print this usage message.

  <CURL_OPTIONS>: Any option supported by curl can be set here. This is not used by wcurl; it is
                  instead forwarded to the curl invocation.

  <URL>: URL to be downloaded. Anything that is not a parameter is considered
         a URL. Whitespace is percent-encoded and the URL is passed to curl, which
         then performs the parsing. May be specified more than once.
_EOF_
}
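
# Example invocations (illustrative only; example.com is a placeholder host):
#   wcurl https://example.com/file.tar.gz
#   wcurl --output=archive.tar.gz https://example.com/file.tar.gz
#   wcurl --curl-options "--progress-bar" --dry-run https://example.com/a.iso https://example.com/b.iso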

# Display an error message and bail out.
error()
{
    printf "%s\n" "$*" >&2
    exit 1
}

# Extra curl options provided by the user.
# This is set per-URL for every URL provided.
# Some options are global, but we err on the side of needlessly setting
# them multiple times instead of causing issues with parameters that need
# to be set per-URL.
CURL_OPTIONS=""

# The URLs to be downloaded.
URLS=""

# Set to the percent-decoded filename parsed from the URL, unless
# --output or --no-decode-filename is used.
OUTPUT_PATH=""
HAS_USER_SET_OUTPUT="false"

# The parameters that are passed per-URL to curl.
readonly PER_URL_PARAMETERS="\
    --fail \
    --globoff \
    --location \
    --proto-default https \
    --remote-time \
    --retry 5 "

# Percent-encode codes that are valid but considered unsafe to decode.
# This is a space-separated list of uppercase two-digit hex codes.
# 2F = /
# 5C = \
readonly UNSAFE_PERCENT_ENCODE="2F 5C"

# Whether to invoke curl or not.
DRY_RUN="false"

# Sanitize parameters.
sanitize()
{
    if [ -z "${URLS}" ]; then
        error "You must provide at least one URL to download."
    fi

    readonly CURL_OPTIONS URLS DRY_RUN HAS_USER_SET_OUTPUT
}

# Indicate via exit code whether the string given in the first parameter
# consists solely of characters from the string given in the second parameter.
# In other words, it returns 0 if the first parameter only contains characters
# from the second parameter, i.e.: are $1's characters a subset of $2's characters?
is_subset_of()
{
    case "${1}" in
        *[!${2}]* | '') return 1 ;;
    esac
}
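
# For example, is_subset_of "2f" "0123456789abcdefABCDEF" returns 0 (success),
# while is_subset_of "2g" "0123456789abcdefABCDEF" and is_subset_of "" "abc"
# both return 1.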

# Indicate via exit code whether the percent-encoded hex code given in the
# first parameter is safe to be decoded.
is_safe_percent_encode()
{
    upper_str=$(printf "%s" "${1}" | tr "[:lower:]" "[:upper:]")
    for unsafe in ${UNSAFE_PERCENT_ENCODE}; do
        if [ "${unsafe}" = "${upper_str}" ]; then
            return 1
        fi
    done

    return 0
}
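
# For example, is_safe_percent_encode "2f" returns 1 (unsafe, it would decode
# to "/"), while is_safe_percent_encode "20" returns 0 (safe to decode into a
# space).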

# Print the given string percent-decoded.
percent_decode()
{
    # Encodings of control characters (00-1F) are passed through without decoding.
    # Iterate over the input character-by-character, decoding it.
    printf "%s\n" "${1}" | fold -w1 | while IFS= read -r decode_out; do
        # If the character is a "%", read the next character as decode_hex1.
        if [ "${decode_out}" = % ] && IFS= read -r decode_hex1; then
            decode_out="${decode_out}${decode_hex1}"
            # If there's one more character, read it as decode_hex2.
            if IFS= read -r decode_hex2; then
                decode_out="${decode_out}${decode_hex2}"
                # Skip decoding if this is a control character (00-1F).
                # Skip decoding if DECODE_FILENAME is not "true".
                # Only the two hex digits (without the leading "%") are checked
                # against the unsafe list.
                if [ "${DECODE_FILENAME}" = "true" ] \
                    && is_subset_of "${decode_hex1}" "23456789abcdefABCDEF" \
                    && is_subset_of "${decode_hex2}" "0123456789abcdefABCDEF" \
                    && is_safe_percent_encode "${decode_hex1}${decode_hex2}"; then
                    # Use printf to convert the hex pair into octal and then decode it to the final character.
                    decode_out="$(printf "%b" "\\$(printf %o "0x${decode_hex1}${decode_hex2}")")"
                fi
            fi
        fi
        printf %s "${decode_out}"
    done
}
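
# For example, with DECODE_FILENAME="true", percent_decode "hello%20world.txt"
# prints "hello world.txt", while "%2F" and "%5C" are passed through untouched
# so a crafted URL cannot smuggle path separators into the output filename.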

# Print the percent-decoded filename portion of the given URL.
get_url_filename()
{
    # Remove the protocol and the query string, if present.
    hostname_and_path="$(printf %s "${1}" | sed -e 's,^[^/]*//,,' -e 's,?.*$,,')"
    # If what remains contains a slash, there's a path; return it percent-decoded.
    case "${hostname_and_path}" in
        # sed removes everything preceding the last '/', e.g.: "example/something" becomes "something".
        */*) percent_decode "$(printf %s "${hostname_and_path}" | sed -e 's,^.*/,,')" ;;
    esac
    # No slash means there was just a hostname and no path; return an empty string.
}
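
# For example, get_url_filename "https://example.com/dir/my%20file.txt?v=1"
# prints "my file.txt" (when DECODE_FILENAME="true"), while
# get_url_filename "https://example.com" prints nothing.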

# Execute curl with the list of URLs provided by the user.
exec_curl()
{
    CMD="curl "

    # Store the version to check whether it supports --no-clobber, --parallel
    # and --parallel-max-host.
    curl_version=$($CMD --version | cut -f2 -d' ' | head -n1)
    curl_version_major=$(echo "$curl_version" | cut -f1 -d.)
    curl_version_minor=$(echo "$curl_version" | cut -f2 -d.)

    CURL_NO_CLOBBER=""
    CURL_PARALLEL=""
    # --no-clobber is only supported since 7.83.0.
    # --parallel is only supported since 7.66.0.
    # --parallel-max-host is only supported since 8.16.0.
    if [ "${curl_version_major}" -ge 8 ]; then
        CURL_NO_CLOBBER="--no-clobber"
        CURL_PARALLEL="--parallel"
        if [ "${curl_version_minor}" -ge 16 ]; then
            CURL_PARALLEL="--parallel --parallel-max-host 5"
        fi
    elif [ "${curl_version_major}" -eq 7 ]; then
        if [ "${curl_version_minor}" -ge 83 ]; then
            CURL_NO_CLOBBER="--no-clobber"
        fi
        if [ "${curl_version_minor}" -ge 66 ]; then
            CURL_PARALLEL="--parallel"
        fi
    fi

    # Detect whether we need --parallel. It's easier to rely on
    # the shell's argument parsing to count the URLs.
    # shellcheck disable=SC2086
    set -- $URLS

    # If there are fewer than two URLs, don't set the parallel flag.
    if [ "$#" -lt 2 ]; then
        CURL_PARALLEL=""
    fi

    # Start assembling the command.
    #
    # We use 'set --' here (again) because (a) we don't have arrays in
    # POSIX shell, and (b) we need better control over the way we
    # split arguments.
    #
    # shellcheck disable=SC2086
    set -- ${CMD} ${CURL_PARALLEL}

    NEXT_PARAMETER=""
    for url in ${URLS}; do
        # If the user did not provide an output path, define one.
        if [ "${HAS_USER_SET_OUTPUT}" = "false" ]; then
            OUTPUT_PATH="$(get_url_filename "${url}")"
            # If we could not get a path from the URL, use the default: index.html.
            [ -z "${OUTPUT_PATH}" ] && OUTPUT_PATH=index.html
        fi
        # shellcheck disable=SC2086
        set -- "$@" ${NEXT_PARAMETER} ${PER_URL_PARAMETERS} ${CURL_NO_CLOBBER} --output "${OUTPUT_PATH}" ${CURL_OPTIONS} "${url}"
        NEXT_PARAMETER="--next"
    done

    if [ "${DRY_RUN}" = "false" ]; then
        exec "$@"
    else
        printf "%s\n" "$@"
    fi
}
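
# With --dry-run, exec_curl prints the assembled command (one argument per
# line) instead of running it. For a single URL on curl >= 7.83.0 that is,
# roughly:
#   curl --fail --globoff --location --proto-default https --remote-time \
#       --retry 5 --no-clobber --output <filename> <URL>
# --parallel (and --parallel-max-host on curl >= 8.16.0) is only added when
# more than one URL is given.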

# Default to decoding the output filename.
DECODE_FILENAME="true"

# Use "${1-}" in order to avoid errors because of 'set -u'.
while [ -n "${1-}" ]; do
    case "${1}" in
        --curl-options=*)
            opt=$(printf "%s\n" "${1}" | sed 's/^--curl-options=//')
            CURL_OPTIONS="${CURL_OPTIONS} ${opt}"
            ;;

        --curl-options)
            shift
            CURL_OPTIONS="${CURL_OPTIONS} ${1}"
            ;;

        --dry-run)
            DRY_RUN="true"
            ;;

        --output=*)
            opt=$(printf "%s\n" "${1}" | sed 's/^--output=//')
            HAS_USER_SET_OUTPUT="true"
            OUTPUT_PATH="${opt}"
            ;;

        -o | -O | --output)
            shift
            HAS_USER_SET_OUTPUT="true"
            OUTPUT_PATH="${1}"
            ;;

        -o* | -O*)
            opt=$(printf "%s\n" "${1}" | sed 's/^-[oO]//')
            HAS_USER_SET_OUTPUT="true"
            OUTPUT_PATH="${opt}"
            ;;

        --no-decode-filename)
            DECODE_FILENAME="false"
            ;;

        -h | --help)
            usage
            exit 0
            ;;

        -V | --version)
            print_version
            exit 0
            ;;

        --)
            # This is the start of the list of URLs.
            shift
            for url in "$@"; do
                # Encode whitespace into %20, since wget supports those URLs.
                newurl=$(printf "%s\n" "${url}" | sed 's/ /%20/g')
                URLS="${URLS} ${newurl}"
            done
            break
            ;;

        -*)
            error "Unknown option: '$1'."
            ;;

        *)
            # This must be a URL.
            # Encode whitespace into %20, since wget supports those URLs.
            newurl=$(printf "%s\n" "${1}" | sed 's/ /%20/g')
            URLS="${URLS} ${newurl}"
            ;;
    esac
    shift
done
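
# At this point the accumulated state drives the curl invocation; for example,
#   wcurl --curl-options="--retry-all-errors" "https://example.com/a file.txt"
# leaves CURL_OPTIONS=" --retry-all-errors" and
# URLS=" https://example.com/a%20file.txt" (note the leading spaces from the
# string concatenation above).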

sanitize
exec_curl