Download all links from a feed using httrack. This is the engine behind the
"Cache" feature of the Semantic Scuttle instance at https://links.sarava.org.
+Usage
+-----
+
+Place this script somewhere and set up a cronjob like this:
+
+`*/5 * * * * /var/sites/arquivo/httracker/httracker > /dev/null 2>&1`
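
The script must be executable, and the cron entry should belong to a user able to write to the paths configured below (the script defines an `arquivo` user and group). Assuming that user is meant to own the job, a minimal setup could be:

`chmod +x /var/sites/arquivo/httracker/httracker`

`crontab -u arquivo -e`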
+
TODO
----
- Include all sites already downloaded by scuttler.
- Support for other fetchers like youtube-dl.
+- Lockfile support (see the sketch after this list).
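
One way to address the lockfile item, sketched here only as an idea, assuming bash and the util-linux `flock(1)` utility, and reusing the `$TMP` directory configured in the script, would be to exit early when another run still holds the lock:

    # Sketch: abort if a previous run is still active.
    LOCKFILE="$TMP/httracker.lock"
    exec 200> "$LOCKFILE"
    if ! flock --nonblock 200; then
      echo "httracker: another instance is already running, exiting." >&2
      exit 1
    fi
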
TMP="/var/sites/arquivo/tmp/httracker"
URLS="$TMP/urls-httracker.txt"
URLS_SCUTTLER="$TMP/urls-scuttler.txt"
-LEVEL="1"
-EXT_LEVEL="1"
FILESIZE=""
USER="arquivo"
GROUP="arquivo"
-DEPTH="1"
+DEPTH="2"
+EXT_DEPTH="1"
fi
# Get each URL
- httrack \
- --mirror \
- --continue \
- --depth=${DEPTH} \
- --near \
- --purge-old=0 \
- --index \
- --cookies=1 \
- --path ${target} \
- -r${LEVEL} ${OPTS} ${url}
- #-e%${EXT_LEVEL} \
- #-m$FILESIZE \
- #--verbose
+ httrack \
+ --mirror \
+ --continue \
+ --depth=${DEPTH} \
+ --ext-depth ${EXT_DEPTH} \
+ --near \
+ --purge-old=0 \
+ --index \
+ --cookies=1 \
+ --path ${target} \
+ ${OPTS} ${url}
if [ "$?" == "0" ]; then
# Mark as downloaded
fi
# Grab URLs from the network
- httrack --verbose \
- --mirror \
- --continue \
- --user links \
- --depth=${DEPTH} \
- --near \
- --purge-old=0 \
- --index \
- --cookies=1 \
- --list ${URLS} \
+ httrack \
+ --mirror \
+ --continue \
+ --depth=${DEPTH} \
+ --ext-depth ${EXT_DEPTH} \
+ --near \
+ --purge-old=0 \
+ --index \
+ --cookies=1 \
+ --user links \
+ --list ${URLS} \
--path ${target} ${OPTS}
}
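
# Note (illustrative): httrack's --list option reads URLs from a plain text
# file, one URL per line, so the $URLS file consumed above is expected to
# contain something like:
#
#   http://example.org/some-article
#   http://example.net/post/123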