gitweb.fluxo.info Git - httruta.git/commitdiff
Initial import
author Silvio Rhatto <rhatto@riseup.net>
Sun, 25 Aug 2013 19:08:20 +0000 (16:08 -0300)
committer Silvio Rhatto <rhatto@riseup.net>
Sun, 25 Aug 2013 19:08:20 +0000 (16:08 -0300)
README.mdwn [new file with mode: 0644]
scuttle.sh [new file with mode: 0644]

diff --git a/README.mdwn b/README.mdwn
new file mode 100644 (file)
index 0000000..b5aff85
--- /dev/null
@@ -0,0 +1,7 @@
+Feed Crawler
+============
+
+Download all links from a feed using httrack.
+
+    curl $URL > urls.txt
+    httrack -r$LEVEL -%e$EXT_LEVEL -m$FILESIZE -Y -%L urls.txt
diff --git a/scuttle.sh b/scuttle.sh
new file mode 100644 (file)
index 0000000..d41405f
--- /dev/null
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+BASEDIR=/var/sites/links  # directory searched for the SemanticScuttle-* entry
+SCUTTLEDIR=$(basename "$(find "${BASEDIR}" -maxdepth 1 -iname "SemanticScuttle-*" | head -n 1)")  # first match only
+CONFIGFILE="${BASEDIR}/${SCUTTLEDIR}/data/config.php"
+MIRRORDIR="${BASEDIR}/mirrors"
+TMPDIR=/tmp
+
+getconf() {  # for each ${CONFIGFILE} line matching $1, prints the text between its first pair of single quotes
+  grep "${1}" "${CONFIGFILE}" | sed -e "s/[^']*'//" -e "s/'.*\$//"
+}
+
+dbuser=$(getconf dbuser)  # database settings, each looked up in the config file via getconf
+dbpass=$(getconf dbpass)
+dbname=$(getconf dbname)
+dbhost=$(getconf dbhost)
+
+sqlquery() {  # runs the SQL statement given in $1 through the mysql client using dbuser/dbpass/dbname/dbhost
+  mysql --skip-column-names --batch \
+        --user="${dbuser}"        \
+        --password="${dbpass}"    \
+        --database="${dbname}"    \
+        --host="${dbhost}"        \
+        --execute="${1}"
+}
+
+# grabs URLs from db into a temp file owned by the links user (mode 600); aborts if mktemp fails
+tmpfile=$(mktemp -p "${TMPDIR}") || exit 1
+chown links:links "${tmpfile}"
+chmod 600 "${tmpfile}"
+sqlquery "select bAddress from sc_bookmarks;" > "${tmpfile}"
+
+# creates target dir: ${MIRRORDIR}/<year>/<month>, made as the links user
+year=$(date +%Y)
+month=$(date +%m)
+day=$(date +%d)  # day of month; not part of TARGETDIR
+TARGETDIR="${MIRRORDIR}/${year}/${month}"
+sudo -u links mkdir -p "${TARGETDIR}"
+
+# grabs URLs from the network
+httrack --verbose           \
+        --user links        \
+        --depth=1           \
+        --purge-old=0       \
+        --index             \
+        --cookies=1         \
+        --list ${tmpfile}   \
+        --path ${TARGETDIR} \