Rework scraping method
This commit is contained in:
parent
f7fc2ef0eb
commit
dbf448bc77
@ -2,8 +2,13 @@
|
|||||||
# South Park Downloader Configuration File #
|
# South Park Downloader Configuration File #
|
||||||
############################################
|
############################################
|
||||||
|
|
||||||
|
# Relative or absolute paths are allowed. Please DON'T use ~/ (it is not expanded inside quotes); use $HOME or /home/$USER instead.
|
||||||
|
|
||||||
# youtube-dl (or youtube-dlc) executable path
|
# youtube-dl (or youtube-dlc) executable path
|
||||||
YOUTUBE_DL="./yt-dlc/youtube-dlc"
|
YOUTUBE_DL="./yt-dlc/youtube-dlc"
|
||||||
|
|
||||||
# Where the downloaded videos will get stored
|
# Where the downloaded videos will get stored
|
||||||
OUTDIR="./downloads"
|
OUTDIR="./downloads"
|
||||||
|
|
||||||
|
# Where cache files are stored
|
||||||
|
CACHEDIR="./cache"
|
||||||
|
@ -1,14 +1,17 @@
|
|||||||
#!/usr/bin/env bash
# Startup: load the configuration and normalise the configured paths.
#
# Bash is required: the substring expansion ${VAR::1} and `source` used below
# are not guaranteed by a plain POSIX sh.

# config.sh lives next to this script and is expected to define YOUTUBE_DL,
# OUTDIR and CACHEDIR.
source "$(dirname "$0")/config.sh"

# Turn paths into absolute ones, if they aren't already. This is necessary
# because the script changes directories later on. Relative paths are resolved
# against the directory containing this script, not the caller's cwd.
[ "${CACHEDIR::1}" = "/" ] ||
    CACHEDIR="$(readlink -f "$(dirname "$0")/$CACHEDIR")"
[ "${OUTDIR::1}" = "/" ] ||
    OUTDIR="$(readlink -f "$(dirname "$0")/$OUTDIR")"
[ "${YOUTUBE_DL::1}" = "/" ] ||
    YOUTUBE_DL="$(readlink -f "$(dirname "$0")/$YOUTUBE_DL")"

# Create the output and cache directories on first use.
[ -e "$OUTDIR" ] || mkdir -p "$OUTDIR"
[ -e "$CACHEDIR" ] || mkdir -p "$CACHEDIR"
|
||||||
|
|
||||||
p_info() {
|
p_info() {
|
||||||
echo -e "\e[32m>>> $@\e[m"
|
echo -e "\e[32m>>> $@\e[m"
|
||||||
@ -20,19 +23,25 @@ p_error() {
|
|||||||
|
|
||||||
# Prints the command-line help: the invocation forms followed by the options.
# Arguments: none
# Outputs:   help text on stdout
usage() {
    # Resolve the script name once instead of once per printed line.
    local self
    self="$(basename "$0")"
    cat <<EOF
Usage:
 ${self} [OPTIONS] -a - Download all episodes
 ${self} [OPTIONS] -s <season> - Download all episodes of the specified season
 ${self} [OPTIONS] -s <season> -e <episode> - Download the specified episode
 ${self} -h - Show help page
Options:
 -p - Show progress (default)
 -P - Hide progress
 -E - Download episodes in English (default)
 -D - Download episodes in German
 -u - Update episode index (default)
 -U - Skip episode index update
EOF
}
|
||||||
|
|
||||||
unset OPT_SEASON OPT_EPISODE OPT_ALL OPT_EN OPT_LANG OPT_PROGRESS
|
unset OPT_SEASON OPT_EPISODE OPT_ALL OPT_EN OPT_LANG OPT_PROGRESS OPT_UPDATE_INDEX
|
||||||
OPT_LANG="EN"
|
OPT_LANG="EN"
|
||||||
|
OPT_PROGRESS=true
|
||||||
|
OPT_UPDATE_INDEX=true
|
||||||
|
|
||||||
while getopts "haEDps:e:" arg; do
|
while getopts "pPEDuUas:e:h" arg; do
|
||||||
case "$arg" in
|
case "$arg" in
|
||||||
h)
|
h)
|
||||||
usage
|
usage
|
||||||
@ -56,66 +65,71 @@ while getopts "haEDps:e:" arg; do
|
|||||||
p)
|
p)
|
||||||
OPT_PROGRESS=true
|
OPT_PROGRESS=true
|
||||||
;;
|
;;
|
||||||
|
P)
|
||||||
|
unset OPT_PROGRESS
|
||||||
|
echo hi
|
||||||
|
;;
|
||||||
|
u)
|
||||||
|
OPT_UPDATE_INDEX=true
|
||||||
|
;;
|
||||||
|
U)
|
||||||
|
unset OPT_UPDATE_INDEX
|
||||||
|
;;
|
||||||
?)
|
?)
|
||||||
echo "Invalid option: -$OPTARG"
|
|
||||||
usage
|
usage
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
# Parts of the URL differ depending on the language of the website
|
|
||||||
# Per-language settings: the index cache file, the episode page the index
# crawl starts from, and the pattern that recognises (quoted, site-relative)
# episode links in a fetched page.
# OPT_LANG falls back to EN, the documented default, when it is unset.
case "${OPT_LANG:-EN}" in
    DE)
        INDEX_FILENAME="$CACHEDIR/_episode_index_DE_"
        INDEX_INITIAL_URL="https://www.southpark.de/folgen/940f8z/south-park-cartman-und-die-analsonde-staffel-1-ep-1"
        REGEX_EPISODE_URL='"/folgen/[0-9a-z]\+/south-park-[0-9a-z-]\+-staffel-[0-9]\+-ep-[0-9]\+"'
        ;;
    EN)
        INDEX_FILENAME="$CACHEDIR/_episode_index_EN_"
        INDEX_INITIAL_URL="https://www.southpark.de/en/episodes/940f8z/south-park-cartman-gets-an-anal-probe-season-1-ep-1"
        REGEX_EPISODE_URL='"/en/episodes/[0-9a-z]\+/south-park-[0-9a-z-]\+-season-[0-9]\+-ep-[0-9]\+"'
        ;;
    *)
        # Without these variables every later index lookup would operate on
        # an empty file name, so fail early and visibly instead.
        printf 'Unsupported language: %s\n' "$OPT_LANG" >&2
        exit 1
        ;;
esac
|
||||||
|
|
||||||
# Indexes all season page URLs
|
# Brings the episode index file up to date by crawling forward from the last
# indexed episode.
#
# The page of the last URL in the index is fetched, every episode link found
# on it is appended, and this repeats until a page's last link is the page
# itself (nothing newer) or a fetch yields no links at all.
#
# Globals: INDEX_FILENAME (read/written), INDEX_INITIAL_URL (read),
#          REGEX_EPISODE_URL (read)
# Outputs: a green progress line on stdout, one dot per page that added links
update_index() {
    local url new_urls deduped

    # Seed a fresh index with the first episode so there is a page to start at.
    [ -e "$INDEX_FILENAME" ] || printf '%s\n' "$INDEX_INITIAL_URL" > "$INDEX_FILENAME"

    printf '\033[32m>>> Updating episode index\033[m'
    while true; do
        url=$(tail -n1 "$INDEX_FILENAME")
        # Extract the quoted relative links, drop the quotes, make them absolute.
        new_urls=$(curl -s "$url" |
            grep -o "$REGEX_EPISODE_URL" |
            tr -d '"' |
            sed 's|^|https://www.southpark.de|')
        # No links (network failure, changed markup): stop instead of appending
        # a blank line and then looping forever on an empty URL.
        [ -n "$new_urls" ] || break
        [ "$url" = "$(printf '%s\n' "$new_urls" | tail -n1)" ] && break
        printf '%s\n' "$new_urls" >> "$INDEX_FILENAME"
        printf '\033[32m.\033[m'
    done

    # awk keeps only the first occurrence of each line, preserving order.
    deduped=$(awk '!seen[$0]++' "$INDEX_FILENAME")
    # '%s\n' keeps URL data out of the format string and leaves a trailing
    # newline, so a later append starts on a fresh line.
    printf '%s\n' "$deduped" > "$INDEX_FILENAME"
    echo
}
|
||||||
|
|
||||||
# Prints all episode URLs of the specified season, one per line.
# Arguments: $1 - season number
# Globals:   INDEX_FILENAME (read)
# Returns:   grep's status: 0 if at least one URL matched, non-zero otherwise
get_season() {
    local season="$1"
    # -e lets the pattern begin with a literal dash without escaping it.
    # The leading dash stops season 1 from also matching season 11, 21, ...
    grep -e "-${season}-ep-[0-9]\+\$" "$INDEX_FILENAME"
}
|
||||||
|
|
||||||
# Prints the URL of the specified episode (nothing if it is not indexed).
# Arguments: $1 - season number
#            $2 - episode number
# Globals:   INDEX_FILENAME (read)
# Returns:   grep's status: 0 if the episode was found, non-zero otherwise
get_episode() {
    local season="$1"
    local episode="$2"
    # -e lets the pattern begin with a literal dash without escaping it; the
    # end anchor stops episode 1 from also matching episode 10, 11, ...
    grep -e "-${season}-ep-${episode}\$" "$INDEX_FILENAME"
}
|
||||||
|
|
||||||
# Prints the number of seasons in the index.
# Effectively counts how many "episode 1" entries the index contains.
# Globals: INDEX_FILENAME (read)
# Outputs: a bare integer on stdout (0 if the index cannot be read)
get_num_seasons() {
    local count
    # grep -c counts matching lines itself; -e allows the leading dash.
    count=$(grep -c -e "-[0-9]\+-ep-1\$" "$INDEX_FILENAME")
    # grep prints nothing when the file is unreadable; report that as 0.
    printf '%s\n' "${count:-0}"
}
|
||||||
|
|
||||||
|
# Prints the number of episodes in the specified season.
# Arguments: $1 - season number
# Outputs:   a bare integer on stdout (0 if the season is not indexed)
get_num_episodes() {
    local season="$1"
    local count
    count=$(get_season "$season" | wc -l)
    # Arithmetic expansion strips the padding some wc implementations emit.
    printf '%s\n' "$((count))"
}
|
||||||
|
|
||||||
tmp_cleanup() {
|
tmp_cleanup() {
|
||||||
@ -133,17 +147,32 @@ monitor_progress() {
|
|||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# SIGINT handler used while an episode is being downloaded: reports the
# interrupt, runs tmp_cleanup and ends the script with status 0.
download_interrupt() {
    p_info "User interrupt received"
    tmp_cleanup
    exit 0
}
|
||||||
|
|
||||||
|
# SIGINT handler used while the downloaded parts are being merged: reports
# the interrupt, runs tmp_cleanup, deletes the partially written output file
# and ends the script with status 0.
# Arguments: $1 - path of the output file being written
merge_interrupt() {
    p_info "User interrupt received"
    tmp_cleanup
    p_info "Cleaning up corrupted output file"
    # Never hand rm an empty path; "--" keeps a path that starts with a dash
    # from being read as an option.
    if [ -n "$1" ]; then
        rm -rf -- "$1"
    fi
    exit 0
}
|
||||||
|
|
||||||
# Takes season and episode number as arguments
|
# Takes season and episode number as arguments
|
||||||
download_episode() {
|
download_episode() {
|
||||||
local SEASON_NUMBER="$1"
|
local SEASON_NUMBER="$1"
|
||||||
local EPISODE_NUMBER="$2"
|
local EPISODE_NUMBER="$2"
|
||||||
get_episode_url "$SEASON_NUMBER" "$EPISODE_NUMBER"
|
|
||||||
local URL="$RES"
|
|
||||||
local OUTFILE="${OUTDIR}/South_Park_S${SEASON_NUMBER}_E${EPISODE_NUMBER}_${OPT_LANG}.mp4"
|
local OUTFILE="${OUTDIR}/South_Park_S${SEASON_NUMBER}_E${EPISODE_NUMBER}_${OPT_LANG}.mp4"
|
||||||
[ -e "$OUTFILE" ] && echo "Already downloaded Season ${SEASON_NUMBER} Episode ${EPISODE_NUMBER}" && return
|
[ -e "$OUTFILE" ] && echo "Already downloaded Season ${SEASON_NUMBER} Episode ${EPISODE_NUMBER}" && return
|
||||||
|
local URL=$(get_episode "$SEASON_NUMBER" "$EPISODE_NUMBER")
|
||||||
|
[ -z "$URL" ] && echo "Unable to download Season ${SEASON_NUMBER} Episode ${EPISODE_NUMBER}; skipping" && return
|
||||||
p_info "Downloading Season $SEASON_NUMBER Episode $EPISODE_NUMBER ($URL)"
|
p_info "Downloading Season $SEASON_NUMBER Episode $EPISODE_NUMBER ($URL)"
|
||||||
|
trap download_interrupt SIGINT
|
||||||
TMPDIR=$(mktemp -d "/tmp/southparkdownloader.XXXXXXXXXX")
|
TMPDIR=$(mktemp -d "/tmp/southparkdownloader.XXXXXXXXXX")
|
||||||
[ -n "OPT_PROGRESS" ] && monitor_progress "$TMPDIR"&
|
[ -n "$OPT_PROGRESS" ] && monitor_progress "$TMPDIR"&
|
||||||
pushd "$TMPDIR" > /dev/null
|
pushd "$TMPDIR" > /dev/null
|
||||||
if ! "$YOUTUBE_DL" "$URL" 2>/dev/null | grep --line-buffered "^\[download\]" | grep -v --line-buffered "^\[download\] Destination:"; then
|
if ! "$YOUTUBE_DL" "$URL" 2>/dev/null | grep --line-buffered "^\[download\]" | grep -v --line-buffered "^\[download\] Destination:"; then
|
||||||
p_info "possible youtube-dl \e[1;31mERROR\e[m"
|
p_info "possible youtube-dl \e[1;31mERROR\e[m"
|
||||||
@ -151,43 +180,42 @@ download_episode() {
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "[download] Merging video files"
|
echo "[download] Merging video files"
|
||||||
# Remove all single quotes from video files, as they cause problems
|
trap "merge_interrupt \"$OUTFILE\"" SIGINT
|
||||||
for i in ./*.mp4; do mv -n "$i" "$(echo $i | tr -d \')"; done
|
# Remove all single quotes and dashes from video files, as they cause problems
|
||||||
|
for i in ./*.mp4; do mv -n "$i" "$(echo $i | tr -d \'-)"; done
|
||||||
# Find all video files and write them into the list
|
# Find all video files and write them into the list
|
||||||
printf "file '%s'\n" ./*.mp4 > list.txt
|
printf "file '%s'\n" ./*.mp4 > list.txt
|
||||||
# Merge video files
|
# Merge video files
|
||||||
ffmpeg -safe 0 -f concat -i "list.txt" -c copy "$OUTFILE" 2>/dev/null
|
ffmpeg -safe 0 -f concat -i "list.txt" -c copy "$OUTFILE" 2>/dev/null
|
||||||
popd > /dev/null
|
popd > /dev/null
|
||||||
|
trap - SIGINT
|
||||||
tmp_cleanup
|
tmp_cleanup
|
||||||
}
|
}
|
||||||
|
|
||||||
# Downloads every episode of one season, in ascending episode order.
# Arguments: $1 - season number
download_season() {
    local season="$1"
    # Declared local so the counter cannot clobber a caller's loop variable.
    local episode_count episode
    episode_count=$(get_num_episodes "$season")
    for ((episode = 1; episode <= episode_count; episode++)); do
        download_episode "$season" "$episode"
    done
}
|
||||||
|
|
||||||
# Downloads every season in the index, in ascending season order.
# Arguments: none
download_all() {
    # Declared local so the counter cannot clobber a caller's loop variable.
    local season_count season
    season_count=$(get_num_seasons)
    for ((season = 1; season <= season_count; season++)); do
        download_season "$season"
    done
}
|
||||||
|
|
||||||
if [ -n "$OPT_SEASON" ]; then
|
if [ -n "$OPT_SEASON" ]; then
|
||||||
get_season_url "$OPT_SEASON"
|
[ -n "$OPT_UPDATE_INDEX" ] && update_index
|
||||||
[ -z "$RES" ] &&
|
[ -z "$(get_season $OPT_SEASON)" ] &&
|
||||||
p_error "Unable to open Season $OPT_SEASON" &&
|
p_error "Unable to find Season $OPT_SEASON" &&
|
||||||
exit 1
|
exit 1
|
||||||
if [ -n "$OPT_EPISODE" ]; then
|
if [ -n "$OPT_EPISODE" ]; then
|
||||||
get_episode_url "$OPT_SEASON" "$OPT_EPISODE"
|
[ -z "$(get_episode $OPT_SEASON $OPT_EPISODE)" ] &&
|
||||||
[ -z "$RES" ] &&
|
p_error "Unable to find Season $OPT_SEASON Episode $OPT_EPISODE" &&
|
||||||
p_error "Unable to open Season $OPT_SEASON Episode $OPT_EPISODE" &&
|
|
||||||
exit 1
|
exit 1
|
||||||
p_info "Going to download Season $OPT_SEASON Episode $OPT_EPISODE"
|
p_info "Going to download Season $OPT_SEASON Episode $OPT_EPISODE"
|
||||||
download_episode "$OPT_SEASON" "$OPT_EPISODE"
|
download_episode "$OPT_SEASON" "$OPT_EPISODE"
|
||||||
@ -196,6 +224,7 @@ if [ -n "$OPT_SEASON" ]; then
|
|||||||
download_season "$OPT_SEASON"
|
download_season "$OPT_SEASON"
|
||||||
fi
|
fi
|
||||||
elif [ -n "$OPT_ALL" ]; then
|
elif [ -n "$OPT_ALL" ]; then
|
||||||
|
[ -n "$OPT_UPDATE_INDEX" ] && update_index
|
||||||
p_info "Going to download ALL episodes"
|
p_info "Going to download ALL episodes"
|
||||||
download_all
|
download_all
|
||||||
else
|
else
|
||||||
|
Loading…
Reference in New Issue
Block a user