blob: 767898d9cd8671405293302ff44e36aff6f75deb (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
|
#!/bin/sh
#
# Copyright (c) 2015-2021 V.Krishn
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the Simplified BSD License (also
# known as the "2-Clause License" or "FreeBSD License".)
#
# This program is distributed in the hope that it will be useful,
# but without any warranty; without even the implied warranty of
# merchantability or fitness for a particular purpose.
#
# Author contact information:
# vkrishn@insteps.net
# http://www.insteps.net
#
# *******************************************************************
#
# Code to fetch and store newsbeuter's feeds/rss/xml data, based on:
# urls (rss/atom)
# tags/category: e.g. '/news/bbc.co.uk'
# dbname: eg. news, business, dev
#
#
#
# Bootstrap: locate the application root, load the environment and the date
# helpers, and stop early when the url database is missing.
#
# APPDIR defaults to the directory two levels above this script unless it is
# already set by the caller.
fpath=$(readlink -f "$0")
if [ -z "$APPDIR" ]; then APPDIR=$(dirname "$(dirname "$fpath")"); fi
PWD=$(pwd)
# '.' is the POSIX spelling; 'source' does not exist in every /bin/sh.
# env.sh presumably defines SCRIPTDIR, CONFIGDIR and friends - they are used
# below but never assigned in this file.
. "$APPDIR/scripts/env.sh"
. "$SCRIPTDIR/date.inc"
if [ ! -f "$CONFIGDIR/urls.db" ]; then
  echo 'Run config/setup first.'
  # NOTE(review): exits with status 0 even though nothing was done; kept
  # as-is because callers may rely on it.
  exit 0
fi
# Make an rss/xml file VCS friendly (git/hg/fossil, etc.), i.e. line
# oriented instead of blob-like. The file is rewritten in place.
#
# Arguments: $1 - path of the xml file to rewrite
# Returns:   1 when $1 is not a regular file, otherwise the status of sed
sanitize_xml() {
  local xml="$1"
  #local xml="$FEEDSDIR/.current.xml"
  if [ ! -f "$xml" ]; then return 1; fi

  # Pass 1: drop the CR of CRLF line endings first (so CR-only lines count
  # as blank in pass 2), then start a new line before every opening tag.
  # Closing tags are not split; the original author noted a
  # "newline in title issue" with the broader form:
  #   sed -i 's/</\n</g' $xml
  # '<\w' already covers <div, <table, <p, <ul, <li, <a and <br, so no
  # per-tag rules are needed.
  sed -i -e 's/\r$//' -e 's/<\w/\n&/g' -- "$xml" || return

  # Pass 2: remove blank and whitespace-only lines. This has to be a second
  # sed run: the newlines inserted above only become real lines once the
  # file has been written back. [[:space:]] is used because '\s' inside a
  # bracket expression matches a literal backslash or 's', not whitespace.
  sed -i -e '/^[[:space:]]*$/d' -- "$xml"
}
# Run sanitize_xml over every *.xml file found below $FEEDSDIR.
#
# Globals:   FEEDSDIR (read)
# Returns:   1 when $FEEDSDIR cannot be entered or the previous working
#            directory cannot be restored
sanitize_xml_feeds() {
  local oldd f
  # Remember where we came from *before* changing directory so it can be
  # restored afterwards.
  oldd=$(pwd)
  cd "$FEEDSDIR" || return 1
  # The pattern is quoted so find sees it, not the shell: unquoted, it
  # would be expanded against xml files sitting directly in $FEEDSDIR.
  find . -type f -name '*.xml' | while IFS= read -r f; do
    sanitize_xml "$f"
  done
  cd "$oldd" || return 1
}
# Decide whether a file looks like an rss/xml document.
#
# Globals:   xmlType (written) - reset to '' on entry, set to 'xml' when the
#            mime type reported by file(1) ends in '/xml'
# Arguments: $1 - path of the file to inspect
# Returns:   0 when a line starts with '<rss version' or the mime subtype is
#            'xml'; 1 otherwise (including a missing file)
is_xml() {
  local mime
  xmlType=''
  if [ ! -f "$1" ]; then return 1; fi

  # Cheap check first: no external 'file' utility needed for plain rss.
  if grep -q '^<rss version' -- "$1"; then return 0; fi

  # dependencies=file, need another option if possible
  # -b drops the leading "path:" so the output is just
  # "<type>/<subtype>; charset=<enc>", whatever the file is called.
  mime=$(file -b --mime-type --mime-encoding -- "$1") || return 1
  # The charset part is deliberately not checked for now.
  mime=${mime##*/}; mime=${mime%%;*}

  # test for .xml file
  case "$mime" in
    xml) xmlType=$mime; return 0 ;;
  esac
  return 1
}
## Example urls
# URL="http://news.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml"
# URL="http://www.rediff.com/rss/inrss.xml"
# Download one feed and file it under $FEEDSDIR, keyed by the sha1 of its
# url: $FEEDSDIR/<1st hex char>/<first 2 hex chars>/<sha1>.xml
#
# Globals read:    FEEDSDIR LOGDIR MONTHLY DAY EPOCH USECURL CURLOPTS_1
#                  WGETOPTS_1 _USERAGENT_0 missedlst cRED cNORMAL
# Globals written: URL URLSUM xmlType localXml a b (left global on purpose,
#                  other code may read them after the call)
# Arguments: $1 - feed url; an empty url is a no-op
# Side effects: changes the working directory to $FEEDSDIR, appends to the
#               daily log, writes .lastfetch, appends the hash of an
#               unreachable url to ${missedlst}.tmp
# Returns:   0 normally, 1 when the url cannot be hashed or $FEEDSDIR cannot
#            be entered.
# NOTE(review): a missing/incorrect feeds dir terminates the whole shell with
# status 0; kept as-is because callers may rely on it.
fetch_url() {
  local logfile
  URL=$1; xmlType=''; localXml='.current.xml'
  if [ -z "$URL" ]; then return 0; fi

  # Hash "url + newline", exactly what the historic `echo $URL` produced, so
  # feeds already stored keep their file names.
  URLSUM=$(printf '%s\n' "$URL" | sha1sum -t | cut -b 1-40 -)
  if [ -z "$URLSUM" ]; then
    # Without a hash the target path would collapse to "/.xml".
    echo 'msg: unable to hash url' >&2
    return 1
  fi
  printf '%s -> %s\n' "$URLSUM" "$URL"

  if [ ! -d "$FEEDSDIR" ]; then
    echo 'Feeds dir missing, run config/setup first.'; exit 0
  fi
  cd "$FEEDSDIR" || return 1
  # '.feeds' marks a directory as a feeds store.
  if [ ! -f '.feeds' ]; then
    echo "Incorrect feeds dir"
    exit 0
  fi

  # First one / first two characters of the hash form the fan-out dirs.
  a=${URLSUM%"${URLSUM#?}"}
  b=${URLSUM%"${URLSUM#??}"}
  mkdir -p "$a/$b"
  rm -f "$localXml"
  mkdir -p "$LOGDIR"
  logfile="$LOGDIR/$MONTHLY-$DAY.log"

  # CURLOPTS_1 / WGETOPTS_1 are left unquoted on purpose: they presumably
  # carry several options that must be split into words.
  if [ "$USECURL" = '1' ]; then
    # shellcheck disable=SC2086
    curl $CURLOPTS_1 --user-agent "$_USERAGENT_0" "$URL" -o "$localXml" -v --stderr - >> "$logfile"
  else
    # The user agent is passed verbatim, as in the curl branch (no literal
    # quote characters wrapped around it).
    # shellcheck disable=SC2086
    wget $WGETOPTS_1 --user-agent="$_USERAGENT_0" "$URL" -O "$localXml" -a "$logfile"
  fi

  if [ ! -s "$localXml" ]; then
    echo "$URLSUM" >> "${missedlst}.tmp"
    printf '%b\n' "${cRED}msg: url unreachable${cNORMAL}"
    return 0
  fi

  # checkpoints
  # 1. make git/fossil friendly
  sanitize_xml "$FEEDSDIR/$localXml"
  # 2. checksize (<2mb) - TODO
  # 3. check for xml
  if is_xml "$localXml"; then
    mv -f "$localXml" "$a/$b/$URLSUM.xml"
    echo "$EPOCH" > .lastfetch
  else
    printf '%b\n' "${cRED}msg: not a xml/rss document${cNORMAL}"
  fi
  return 0
}
# Fetch every url of a newline separated list, using a run file named after
# the given epoch.
#
# Globals:   RUNDIR, APPDIR, cRED, cNORMAL (read)
# Arguments: $1 - newline separated list of urls (normally a query result)
#            $2 - epoch, used as the name of the run file
# Files:     $RUNDIR/fetch/<epoch>       work list, removed when done
#            $RUNDIR/fetch/<epoch>.done  processed lists are appended here
# Returns:   0 when the list is empty or was processed, 1 when the run file
#            cannot be written or $APPDIR cannot be entered
_fetch_querylist() {
  local s="$1" epoch="$2"
  local extfetch url
  if [ -z "$s" ]; then
    printf "${cRED}Nothing done ! (no record found)${cNORMAL}\n"
    return
  fi

  extfetch="$RUNDIR/fetch/$epoch"
  mkdir -p "$RUNDIR/fetch" || return 1
  printf '%s\n' "$s" > "$extfetch" || return 1

  # The list is read on fd 3 so nothing started by fetch_url can eat the
  # remaining lines from stdin.
  while IFS= read -r url <&3; do
    # fetch_url changes directory, so start every round from $APPDIR.
    cd "$APPDIR" || return 1
    fetch_url "$url"
  done 3< "$extfetch"

  cat "$extfetch" >> "$extfetch.done"
  rm -f "$extfetch"
}
# Fetch a single feed, provided its url is registered in the url database.
#
# Globals:   CONFIGDIR, cBWHITE, cNORMAL (read)
# Arguments: $1 - feed url (looked up in rss_url.rssurl)
#            $2 - epoch handed on to _fetch_querylist
# NOTE(review): a missing argument terminates the whole shell with status 0;
# kept as-is because callers may rely on it.
fetch_by_url() {
  local url="$1" epoch="$2"
  local quoted query s
  if [ -z "$url" ] || [ -z "$epoch" ]; then exit 0; fi

  # The url is data, never part of a printf format: '%xx' escapes are
  # common in urls and would otherwise be read as conversions.
  printf "${cBWHITE}fetch::by-url ->${cNORMAL} %s\n" "$url"

  # Double embedded single quotes so the value cannot end the SQL literal.
  quoted=$(printf '%s' "$url" | sed "s/'/''/g")
  query="SELECT rssurl FROM rss_url WHERE rssurl='$quoted';"
  s=$(printf '%s\n' "$query" | sqlite3 "$CONFIGDIR/urls.db")
  _fetch_querylist "$s" "$epoch"
}
# Fetch every feed whose tag matches exactly.
#
# Globals:   CONFIGDIR, cBWHITE, cNORMAL (read)
# Arguments: $1 - tag, e.g. '/news/bbc.co.uk' (compared with rss_url.tags)
#            $2 - epoch handed on to _fetch_querylist
# NOTE(review): a missing argument terminates the whole shell with status 0;
# kept as-is because callers may rely on it.
fetch_by_tag() {
  local tag="$1" epoch="$2"
  local quoted query s
  if [ -z "$tag" ] || [ -z "$epoch" ]; then exit 0; fi

  # The tag is passed as data so a '%' in it is not read as a conversion.
  printf "${cBWHITE}fetch::by-tag ->${cNORMAL} %s\n" "$tag"

  # Double embedded single quotes so the value cannot end the SQL literal.
  quoted=$(printf '%s' "$tag" | sed "s/'/''/g")
  query="SELECT rssurl FROM rss_url WHERE tags='$quoted';"
  s=$(printf '%s\n' "$query" | sqlite3 "$CONFIGDIR/urls.db")
  _fetch_querylist "$s" "$epoch"
}
# Fetch every feed filed below a tag "folder", i.e. whose tag starts with
# "<tag>/".
#
# Globals:   CONFIGDIR, cBWHITE, cNORMAL (read)
# Arguments: $1 - tag prefix, e.g. '/news' (matched as LIKE '<tag>/%')
#            $2 - epoch handed on to _fetch_querylist
# NOTE(review): a missing argument terminates the whole shell with status 0;
# kept as-is because callers may rely on it. LIKE wildcards inside the tag
# itself ('%', '_') are still interpreted by sqlite.
fetch_by_tagfolder() {
  local tag="$1" epoch="$2"
  local quoted query s
  if [ -z "$tag" ] || [ -z "$epoch" ]; then exit 0; fi

  # The tag is passed as data so a '%' in it is not read as a conversion.
  printf "${cBWHITE}fetch::by-tagfolder ->${cNORMAL} %s\n" "$tag"

  # Double embedded single quotes so the value cannot end the SQL literal.
  quoted=$(printf '%s' "$tag" | sed "s/'/''/g")
  query="SELECT rssurl FROM rss_url WHERE tags LIKE '$quoted/%';"
  s=$(printf '%s\n' "$query" | sqlite3 "$CONFIGDIR/urls.db")
  _fetch_querylist "$s" "$epoch"
}
# Fetch every feed registered for one url database name.
#
# Globals:   URLDIR, CONFIGDIR, APPDIR, cBWHITE, cNORMAL (read)
# Arguments: $1 - dbname, e.g. 'news'; $URLDIR/<dbname> must exist
# Returns:   1 when $APPDIR cannot be entered, else the status of the last
#            fetch
# NOTE(review): a missing argument or db file terminates the whole shell
# with status 0; kept as-is because callers may rely on it.
fetch_by_dbname() {
  local dbname="$1"
  local quoted query s feed_url
  if [ -z "$dbname" ]; then exit 0; fi
  if [ ! -f "$URLDIR/$dbname" ]; then echo 'Run config/setup first.'; exit 0; fi

  # The name is passed as data so a '%' in it is not read as a conversion.
  printf "${cBWHITE}fetch::by-db ->${cNORMAL} %s\n" "$dbname"

  # Double embedded single quotes so the value cannot end the SQL literal.
  quoted=$(printf '%s' "$dbname" | sed "s/'/''/g")
  query="SELECT rssurl FROM rss_url WHERE dbname='$quoted';"
  s=$(printf '%s\n' "$query" | sqlite3 "$CONFIGDIR/urls.db")

  # Read the result line by line instead of word-splitting it: urls may
  # contain glob characters such as '?' or '*'. A here-document keeps the
  # loop in the current shell; fd 3 keeps stdin free for the fetcher.
  while IFS= read -r feed_url <&3; do
    if [ -z "$feed_url" ]; then continue; fi
    # fetch_url changes directory, so start every round from $APPDIR.
    cd "$APPDIR" || return 1
    fetch_url "$feed_url"
  done 3<<EOF
$s
EOF
}
# Placeholder, not implemented yet: only prints a '--' marker line.
fetch_all() {
  printf '%s\n' '--'
}
# Placeholder, not implemented yet: only prints a '--' marker line.
fetch() {
  printf '%s\n' '--'
}
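## examples
# (fetch_by_tag and fetch_by_url need an epoch as 2nd argument and exit
#  silently without it; fetch_url takes only the url)
# fetch_by_dbname business
# fetch_by_tag '/business/crmbuyer.com' "$EPOCH" > /dev/null 2>&1
# fetch_by_url 'http://www.crmbuyer.com/perl/syndication/rssfull.pl' "$EPOCH"
# fetch_url http://distrowatch.com/news/dwp.xml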
|