This commit is contained in:
Michael Murtaugh 2015-08-25 12:44:16 +02:00
parent f30aafb5c7
commit d3732a1aee
2 changed files with 6 additions and 6 deletions

View File

@@ -7,11 +7,11 @@ from datetime import datetime
from xml.etree import cElementTree as ET from xml.etree import cElementTree as ET
from urllib import urlencode from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError from urllib2 import urlopen, HTTPError, URLError
# dependencies
import html5lib, jinja2
# local mods # local mods
from trim import trim_removed_spans, contents, set_text_contents, text_contents from et_helpers import trim_removed_spans, contents, set_text_contents, text_contents
from linkify import linkify, urlify, filename_to_padid from linkify import linkify, urlify, filename_to_padid
# external dependencies (use pip to install these)
import html5lib, jinja2
def get_template_env (tpath=None): def get_template_env (tpath=None):
@@ -100,7 +100,6 @@ while len(todo) > 0:
os.makedirs(args.path) os.makedirs(args.path)
except OSError: except OSError:
pass pass
# print ("{0}".format(padid).encode("utf-8"), file=sys.stderr)
# _ # _
# _ __ ___ ___| |_ __ _ # _ __ ___ ___| |_ __ _
@@ -184,6 +183,9 @@ while len(todo) > 0:
if args.spider: if args.spider:
for l in links: for l in links:
if l not in todo and l not in done: if l not in todo and l not in done:
if l.startswith("http://") or l.startswith("https://"):
print ("Ignoring absolute URL in [[ link ]] form", file=sys.stderr)
continue
# if args.verbose: # if args.verbose:
# print (" link: {0}".format(l), file=sys.stderr) # print (" link: {0}".format(l), file=sys.stderr)
todo.append(l) todo.append(l)
@@ -237,8 +239,6 @@ while len(todo) > 0:
a.remove(br); a.insert(1, br) a.remove(br); a.insert(1, br)
img.attrib['src'] = href img.attrib['src'] = href
# extract the style tag (with authorship colors) # extract the style tag (with authorship colors)
style = t.find(".//style") style = t.find(".//style")
if style != None: if style != None: