|
|
@@ -101,171 +101,176 @@ while len(todo) > 0:
        except OSError:
            pass

    try:
        #                 _
        #  _ __ ___   ___| |_ __ _
        # | '_ ` _ \ / _ \ __/ _` |
        # | | | | | |  __/ || (_| |
        # |_| |_| |_|\___|\__\__,_|

        meta_out = "{0}/{1}".format(args.path, urlify(padid, ext=".json"))
        if not args.hidepaths:
            print (meta_out, file=sys.stderr)
        if not args.pretend:
            meta = {}
            meta['padid'] = padid
            revisions_url = apiurl+'getRevisionsCount?'+urlencode(data)
            if args.showurls:
                print (revisions_url, file=sys.stderr)
            meta['total_revisions'] = json.load(urlopen(revisions_url))['data']['revisions']

            lastedited_url = apiurl+'getLastEdited?'+urlencode(data)
            if args.showurls:
                print (lastedited_url, file=sys.stderr)
            lastedited_raw = json.load(urlopen(lastedited_url))['data']['lastEdited']
            meta['lastedited_raw'] = lastedited_raw
            meta['lastedited'] = datetime.fromtimestamp(int(lastedited_raw)/1000).isoformat()

            # author_ids (unfortunately, this is a list of internal etherpad author ids -- not the names ppl type)
            authors_url = apiurl+'listAuthorsOfPad?'+urlencode(data)
            if args.showurls:
                print (authors_url, file=sys.stderr)
            meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']

            with open(meta_out, "w") as f:
                json.dump(meta, f)

        #  _ __ __ ___      __
        # | '__/ _` \ \ /\ / /
        # | | | (_| |\ V  V /
        # |_|  \__,_| \_/\_/

        raw_out = "{0}/{1}".format(args.path, urlify(padid, ext=".txt"))
        if not args.hidepaths:
            print (raw_out, file=sys.stderr)
        text_url = apiurl+"getText?"+urlencode(data)
        if args.showurls:
            print (text_url, file=sys.stderr)
        if not args.pretend:
            rawText = json.load(urlopen(text_url))['data']['text']
            with open(raw_out, "w") as f:
                f.write(rawText.encode("utf-8"))

        #  _     _             _
        # | |__ | |_ _ __ ___ | |
        # | '_ \| __| '_ ` _ \| |
        # | | | | |_| | | | | | |
        # |_| |_|\__|_| |_| |_|_|

        # todo ? -- regular HTML output

        #            _
        #   ___ ___ | | ___  _ __ ___
        #  / __/ _ \| |/ _ \| '__/ __|
        # | (_| (_) | | (_) | |  \__ \
        #  \___\___/|_|\___/|_|  |___/

        colors_out = "{0}/{1}".format(args.path, urlify(padid, ext=".html"))
        if not args.hidepaths:
            print (colors_out, file=sys.stderr)
        data['startRev'] = "0"
        colors_url = apiurl+'createDiffHTML?'+urlencode(data)
        if args.showurls:
            print (colors_url, file=sys.stderr)
        html = json.load(urlopen(colors_url))['data']['html']
        t = html5lib.parse(html, namespaceHTMLElements=False)
        trim_removed_spans(t)
        html = ET.tostring(t, method="html")

        # Stage 1: Process as text
        # Process [[wikilink]] style links
        # and (optionally) add linked page names to spider todo list
        html, links = linkify(html)
        if args.spider:
            for l in links:
                if l not in todo and l not in done:
                    if l.startswith("http://") or l.startswith("https://"):
                        print ("Ignoring absolute URL in [[ link ]] form", file=sys.stderr)
                        continue
                    # if args.verbose:
                    #     print ("  link: {0}".format(l), file=sys.stderr)
                    todo.append(l)

        # Stage 2: Process as ElementTree
        #
        t = html5lib.parse(html, namespaceHTMLElements=False)
        # apply linkpats
        for a in t.findall(".//a"):
            href = a.attrib.get("href")
            original_href = href
            if href:
                # if args.verbose:
                #     print ("searching for PADLINK: {0}".format(href))
                for pat in padlinkpats:
                    if re.search(pat, href) != None:
                        # if args.verbose:
                        #     print ("  found PADLINK: {0}".format(href))
                        href = re.sub(pat, "\\1.html", href)
                        padid = filename_to_padid(href)
                        set_text_contents(a, "[[{0}]]".format(padid))
                        if padid not in todo and padid not in done:
                            if args.verbose:
                                print ("  link: {0}".format(padid), file=sys.stderr)
                            todo.append(padid)
                # apply linkpats
                for s, r in linkpats:
                    href = re.sub(s, r, href)
                if href != original_href:
                    old_contents = text_contents(a)
                    # print ("OLD_CONTENTS {0}".format(old_contents))
                    if old_contents == original_href:
                        if args.verbose:
                            print ("  Updating href IN TEXT", file=sys.stderr)
                        set_text_contents(a, href)

                if original_href != href:
                    if args.verbose:
                        print ("  Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
                    a.attrib['href'] = href

                # SHOWIMAGES : inject img tag for (local) images
                if args.add_images:
                    ext = os.path.splitext(href)[1].lower().lstrip(".")
                    if ext in ("png", "gif", "jpeg", "jpg"):
                        # ap = _parent(a)
                        print ("Adding img '{0}'".format(href), file=sys.stderr)
                        img = ET.SubElement(a, "img")
                        br = ET.SubElement(a, "br")
                        a.remove(img); a.insert(0, img)
                        a.remove(br); a.insert(1, br)
                        img.attrib['src'] = href

        # extract the style tag (with authorship colors)
        style = t.find(".//style")
        if style != None:
            style = ET.tostring(style, method="html")
        else:
            style = ""
        # and extract the contents of the body
        html = contents(t.find(".//body"))

        if not args.pretend:
            with open(colors_out, "w") as f:
                # f.write(html.encode("utf-8"))
                f.write(colors_template.render(
                    html = html,
                    style = style,
                    revision = meta['total_revisions'],
                    padid = padid,
                    timestamp = datetime.now()
                ).encode("utf-8"))

        #  _
        # | | ___   ___  _ __
        # | |/ _ \ / _ \| '_ \
        # | | (_) | (_) | |_) |
        # |_|\___/ \___/| .__/
        #               |_|

        count += 1
        if args.limit and count >= args.limit:
            break
print ("Ignoring absolute URL in [[ link ]] form", file=sys.stderr) |
|
|
|
continue |
|
|
|
# if args.verbose: |
|
|
|
# print (" link: {0}".format(l), file=sys.stderr) |
|
|
|
todo.append(l) |
|
|
|
|
|
|
|
# Stage 2: Process as ElementTree |
|
|
|
# |
|
|
|
t = html5lib.parse(html, namespaceHTMLElements=False) |
|
|
|
# apply linkpats |
|
|
|
for a in t.findall(".//a"): |
|
|
|
href = a.attrib.get("href") |
|
|
|
original_href = href |
|
|
|
if href: |
|
|
|
# if args.verbose: |
|
|
|
# print ("searching for PADLINK: {0}".format(href)) |
|
|
|
for pat in padlinkpats: |
|
|
|
if re.search(pat, href) != None: |
|
|
|
# if args.verbose: |
|
|
|
# print (" found PADLINK: {0}".format(href)) |
|
|
|
href = re.sub(pat, "\\1.html", href) |
|
|
|
padid = filename_to_padid(href) |
|
|
|
set_text_contents(a, "[[{0}]]".format(padid)) |
|
|
|
if padid not in todo and padid not in done: |
|
|
|
if args.verbose: |
|
|
|
print (" link: {0}".format(padid), file=sys.stderr) |
|
|
|
todo.append(padid) |
|
|
|
# apply linkpats |
|
|
|
for s, r in linkpats: |
|
|
|
href = re.sub(s, r, href) |
|
|
|
if href != original_href: |
|
|
|
old_contents = text_contents(a) |
|
|
|
# print ("OLD_CONTENTS {0}".format(old_contents)) |
|
|
|
if old_contents == original_href: |
|
|
|
if args.verbose: |
|
|
|
print (" Updating href IN TEXT", file=sys.stderr) |
|
|
|
set_text_contents(a, href) |
|
|
|
|
|
|
|
if original_href != href: |
|
|
|
if args.verbose: |
|
|
|
print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr) |
|
|
|
a.attrib['href'] = href |
|
|
|
|
|
|
|
# SHOWIMAGES : inject img tag for (local) images |
|
|
|
if args.add_images: |
|
|
|
ext = os.path.splitext(href)[1].lower().lstrip(".") |
|
|
|
if ext in ("png", "gif", "jpeg", "jpg"): |
|
|
|
# ap = _parent(a) |
|
|
|
print ("Adding img '{0}'".format(href), file=sys.stderr) |
|
|
|
img = ET.SubElement(a, "img") |
|
|
|
br = ET.SubElement(a, "br") |
|
|
|
a.remove(img); a.insert(0, img) |
|
|
|
a.remove(br); a.insert(1, br) |
|
|
|
img.attrib['src'] = href |
|
|
|
|
|
|
|
# extract the style tag (with authorship colors) |
|
|
|
style = t.find(".//style") |
|
|
|
if style != None: |
|
|
|
style = ET.tostring(style, method="html") |
|
|
|
else: |
|
|
|
style = "" |
|
|
|
# and extract the contents of the body |
|
|
|
html = contents(t.find(".//body")) |
|
|
|
|
|
|
|
if not args.pretend: |
|
|
|
with open(colors_out, "w") as f: |
|
|
|
# f.write(html.encode("utf-8")) |
|
|
|
f.write(colors_template.render( |
|
|
|
html = html, |
|
|
|
style = style, |
|
|
|
revision = meta['total_revisions'], |
|
|
|
padid = padid, |
|
|
|
timestamp = datetime.now() |
|
|
|
).encode("utf-8")) |
|
|
|
|
|
|
|
# _ |
|
|
|
# | | ___ ___ _ __ |
|
|
|
# | |/ _ \ / _ \| '_ \ |
|
|
|
# | | (_) | (_) | |_) | |
|
|
|
# |_|\___/ \___/| .__/ |
|
|
|
# |_| |
|
|
|
|
|
|
|
count += 1 |
|
|
|
if args.limit and count >= args.limit: |
|
|
|
break |
|
|
|
    except TypeError:
        print ("ERROR, skipping!", file=sys.stderr)