Move pad pulling into own function

This commit is contained in:
Luke Murphy 2020-01-19 00:03:00 +01:00
parent 97bcca145b
commit b82f39a42d
No known key found for this signature in database
GPG Key ID: 5E2EF5A63E3718CC

View File

@ -163,19 +163,7 @@ def build_argument_parser(args):
return parser return parser
def get_padids(args, info, data):
def main(args):
p = build_argument_parser(args)
args = p.parse_args(args)
raw_ext = ".raw.txt"
if args.no_raw_ext:
raw_ext = ""
info = loadpadinfo(args.padinfo)
data = {}
data['apikey'] = info['apikey']
if args.padid: if args.padid:
padids = args.padid padids = args.padid
elif args.glob: elif args.glob:
@ -188,258 +176,208 @@ def main(args):
info['localapiurl'] + 'listAllPads?' + urlencode(data) info['localapiurl'] + 'listAllPads?' + urlencode(data)
)['data']['padIDs'] )['data']['padIDs']
padids.sort() padids.sort()
numpads = len(padids) return padids
# maxmsglen = 0
count = 0
progress_kwargs = {}
if not istty():
progress_kwargs.update(dict(disable=True))
progress_pads = tqdm(iterable=padids, total=len(padids), **progress_kwargs)
for i, padid in enumerate(progress_pads): def handle_pad(args, index, padid, data, info, raw_ext):
if args.skip != None and i < args.skip: if args.skip != None and index < args.skip:
continue return
data['padID'] = padid data['padID'] = padid
p = padpath(padid, args.pub, args.group, args.fix_names) p = padpath(padid, args.pub, args.group, args.fix_names)
if args.folder: if args.folder:
p = os.path.join(p, padid) p = os.path.join(p, padid)
metapath = p + ".meta.json" metapath = p + ".meta.json"
revisions = None revisions = None
tries = 1 tries = 1
skip = False skip = False
padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"]) padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
meta = {} meta = {}
while True: while True:
try: try:
if os.path.exists(metapath): if os.path.exists(metapath):
with open(metapath) as f: with open(metapath) as f:
meta.update(json.load(f)) meta.update(json.load(f))
revisions = getjson( revisions = getjson(
info['localapiurl'] info['localapiurl'] + 'getRevisionsCount?' + urlencode(data)
+ 'getRevisionsCount?' )['data']['revisions']
+ urlencode(data) if meta['revisions'] == revisions and not args.force:
)['data']['revisions']
if meta['revisions'] == revisions and not args.force:
skip = True
break
meta['padid'] = padid
versions = meta["versions"] = []
versions.append(
{
"url": padurlbase + quote(padid),
"type": "pad",
"code": 200,
}
)
if revisions == None:
meta['revisions'] = getjson(
info['localapiurl']
+ 'getRevisionsCount?'
+ urlencode(data)
)['data']['revisions']
else:
meta['revisions'] = revisions
if (meta['revisions'] == 0) and (not args.zerorevs):
skip = True skip = True
break break
# todo: load more metadata! meta['padid'] = padid
meta['group'], meta['pad'] = splitpadname(padid) versions = meta["versions"] = []
meta['pathbase'] = p versions.append(
meta['lastedited_raw'] = int( {"url": padurlbase + quote(padid), "type": "pad", "code": 200,}
getjson( )
info['localapiurl'] + 'getLastEdited?' + urlencode(data)
)['data']['lastEdited'] if revisions is None:
) meta['revisions'] = getjson(
meta['lastedited_iso'] = datetime.fromtimestamp( info['localapiurl'] + 'getRevisionsCount?' + urlencode(data)
int(meta['lastedited_raw']) / 1000 )['data']['revisions']
).isoformat() else:
meta['author_ids'] = getjson( meta['revisions'] = revisions
info['localapiurl'] + 'listAuthorsOfPad?' + urlencode(data)
)['data']['authorIDs'] if (meta['revisions'] == 0) and (not args.zerorevs):
skip = True
break break
except HTTPError as e:
tries += 1 # todo: load more metadata!
if tries > 3: meta['group'], meta['pad'] = splitpadname(padid)
print( meta['pathbase'] = p
"Too many failures ({0}), skipping".format(padid), meta['lastedited_raw'] = int(
file=sys.stderr, getjson(
) info['localapiurl'] + 'getLastEdited?' + urlencode(data)
skip = True )['data']['lastEdited']
break )
else: meta['lastedited_iso'] = datetime.fromtimestamp(
sleep(3) int(meta['lastedited_raw']) / 1000
except TypeError as e: ).isoformat()
meta['author_ids'] = getjson(
info['localapiurl'] + 'listAuthorsOfPad?' + urlencode(data)
)['data']['authorIDs']
break
except HTTPError as e:
tries += 1
if tries > 3:
print( print(
"Type Error loading pad {0} (phantom pad?), skipping".format( "Too many failures ({0}), skipping".format(padid),
padid
),
file=sys.stderr, file=sys.stderr,
) )
skip = True skip = True
break break
else:
sleep(3)
except TypeError as e:
print(
"Type Error loading pad {0} (phantom pad?), skipping".format(
padid
),
file=sys.stderr,
)
skip = True
break
if skip: if skip:
continue return
count += 1 if args.output:
print(padid)
if args.output: if args.all or (args.meta or args.text or args.html or args.dhtml):
print(padid) try:
os.makedirs(os.path.split(metapath)[0])
except OSError:
pass
if args.all or (args.meta or args.text or args.html or args.dhtml): if args.all or args.text:
try: text = getjson(info['localapiurl'] + 'getText?' + urlencode(data))
os.makedirs(os.path.split(metapath)[0]) ver = {"type": "text"}
except OSError: versions.append(ver)
pass ver["code"] = text["_code"]
if text["_code"] == 200:
text = text['data']['text']
if args.all or args.text: ##########################################
text = getjson(info['localapiurl'] + 'getText?' + urlencode(data)) ## ENFORCE __NOPUBLISH__ MAGIC WORD
ver = {"type": "text"} ##########################################
versions.append(ver) if args.nopublish and args.nopublish in text:
ver["code"] = text["_code"] # NEED TO PURGE ANY EXISTING DOCS
if text["_code"] == 200: try_deleting(
text = text['data']['text'] (
p + raw_ext,
########################################## p + ".raw.html",
## ENFORCE __NOPUBLISH__ MAGIC WORD p + ".diff.html",
########################################## p + ".meta.json",
if args.nopublish and args.nopublish in text:
# NEED TO PURGE ANY EXISTING DOCS
try_deleting(
(
p + raw_ext,
p + ".raw.html",
p + ".diff.html",
p + ".meta.json",
)
) )
continue )
return
########################################## ##########################################
## ENFORCE __PUBLISH__ MAGIC WORD ## ENFORCE __PUBLISH__ MAGIC WORD
########################################## ##########################################
if args.publish_opt_in and args.publish not in text: if args.publish_opt_in and args.publish not in text:
try_deleting( try_deleting(
( (
p + raw_ext, p + raw_ext,
p + ".raw.html", p + ".raw.html",
p + ".diff.html", p + ".diff.html",
p + ".meta.json", p + ".meta.json",
)
) )
continue )
return
ver["path"] = p + raw_ext ver["path"] = p + raw_ext
ver["url"] = quote(ver["path"]) ver["url"] = quote(ver["path"])
with open(ver["path"], "w") as f: with open(ver["path"], "w") as f:
f.write(text) f.write(text)
# once the content is settled, compute a hash # once the content is settled, compute a hash
# and link it in the metadata! # and link it in the metadata!
links = [] links = []
if args.css: if args.css:
links.append({"href": args.css, "rel": "stylesheet"}) links.append({"href": args.css, "rel": "stylesheet"})
# todo, make this process reflect which files actually were made # todo, make this process reflect which files actually were made
versionbaseurl = quote(padid) versionbaseurl = quote(padid)
links.append(
{
"href": versions[0]["url"],
"rel": "alternate",
"type": "text/html",
"title": "Etherpad",
}
)
if args.all or args.text:
links.append( links.append(
{ {
"href": versions[0]["url"], "href": versionbaseurl + raw_ext,
"rel": "alternate", "rel": "alternate",
"type": "text/html", "type": "text/plain",
"title": "Etherpad", "title": "Plain text",
}
)
if args.all or args.html:
links.append(
{
"href": versionbaseurl + ".raw.html",
"rel": "alternate",
"type": "text/html",
"title": "HTML",
}
)
if args.all or args.dhtml:
links.append(
{
"href": versionbaseurl + ".diff.html",
"rel": "alternate",
"type": "text/html",
"title": "HTML with author colors",
}
)
if args.all or args.meta:
links.append(
{
"href": versionbaseurl + ".meta.json",
"rel": "alternate",
"type": "application/json",
"title": "Meta data",
} }
) )
if args.all or args.text:
links.append(
{
"href": versionbaseurl + raw_ext,
"rel": "alternate",
"type": "text/plain",
"title": "Plain text",
}
)
if args.all or args.html:
links.append(
{
"href": versionbaseurl + ".raw.html",
"rel": "alternate",
"type": "text/html",
"title": "HTML",
}
)
if args.all or args.dhtml:
links.append(
{
"href": versionbaseurl + ".diff.html",
"rel": "alternate",
"type": "text/html",
"title": "HTML with author colors",
}
)
if args.all or args.meta:
links.append(
{
"href": versionbaseurl + ".meta.json",
"rel": "alternate",
"type": "application/json",
"title": "Meta data",
}
)
# links.append({"href":"/", "rel":"search", "type":"text/html", "title":"Index"}) if args.all or args.dhtml:
data['startRev'] = "0"
if args.all or args.dhtml: html = getjson(
data['startRev'] = "0" info['localapiurl'] + 'createDiffHTML?' + urlencode(data)
html = getjson( )
info['localapiurl'] + 'createDiffHTML?' + urlencode(data) ver = {"type": "diffhtml"}
) versions.append(ver)
ver = {"type": "diffhtml"} ver["code"] = html["_code"]
versions.append(ver) if html["_code"] == 200:
ver["code"] = html["_code"] try:
if html["_code"] == 200:
try:
html = html['data']['html']
ver["path"] = p + ".diff.html"
ver["url"] = quote(ver["path"])
# doc = html5lib.parse(html, treebuilder="etree", override_encoding="utf-8", namespaceHTMLElements=False)
doc = html5lib.parse(
html, treebuilder="etree", namespaceHTMLElements=False
)
html5tidy(
doc,
indent=True,
title=padid,
scripts=args.script,
links=links,
)
with open(ver["path"], "w") as f:
print(
ET.tostring(doc, method="html", encoding="unicode"),
file=f,
)
except TypeError:
# Malformed / incomplete response, record the message (such as "internal error") in the metadata and write NO file!
ver["message"] = html["message"]
# with open(ver["path"], "w") as f:
# print ("""<pre>{0}</pre>""".format(json.dumps(html, indent=2)), file=f)
# Process text, html, dhtml, all options
if args.all or args.html:
html = getjson(info['localapiurl'] + 'getHTML?' + urlencode(data))
ver = {"type": "html"}
versions.append(ver)
ver["code"] = html["_code"]
if html["_code"] == 200:
html = html['data']['html'] html = html['data']['html']
ver["path"] = p + ".raw.html" ver["path"] = p + ".diff.html"
ver["url"] = quote(ver["path"]) ver["url"] = quote(ver["path"])
doc = html5lib.parse( doc = html5lib.parse(
html, treebuilder="etree", namespaceHTMLElements=False html, treebuilder="etree", namespaceHTMLElements=False
@ -456,12 +394,58 @@ def main(args):
ET.tostring(doc, method="html", encoding="unicode"), ET.tostring(doc, method="html", encoding="unicode"),
file=f, file=f,
) )
except TypeError:
ver["message"] = html["message"]
# output meta # Process text, html, dhtml, all options
if args.all or args.meta: if args.all or args.html:
ver = {"type": "meta"} html = getjson(info['localapiurl'] + 'getHTML?' + urlencode(data))
versions.append(ver) ver = {"type": "html"}
ver["path"] = metapath versions.append(ver)
ver["url"] = quote(metapath) ver["code"] = html["_code"]
with open(metapath, "w") as f: if html["_code"] == 200:
json.dump(meta, f, indent=2) html = html['data']['html']
ver["path"] = p + ".raw.html"
ver["url"] = quote(ver["path"])
doc = html5lib.parse(
html, treebuilder="etree", namespaceHTMLElements=False
)
html5tidy(
doc, indent=True, title=padid, scripts=args.script, links=links,
)
with open(ver["path"], "w") as f:
print(
ET.tostring(doc, method="html", encoding="unicode"), file=f,
)
# output meta
if args.all or args.meta:
ver = {"type": "meta"}
versions.append(ver)
ver["path"] = metapath
ver["url"] = quote(metapath)
with open(metapath, "w") as f:
json.dump(meta, f, indent=2)
def main(args):
    """Parse the command line, resolve the pad ids, and process each pad.

    Args:
        args: Raw argument list; it is handed to ``build_argument_parser``
            and then to the resulting parser's ``parse_args``.

    The shared ``data`` dict (seeded with the API key from the pad info) and
    the ``info`` mapping are passed unchanged to ``get_padids`` and to every
    ``handle_pad`` call.
    """
    parser = build_argument_parser(args)
    args = parser.parse_args(args)

    # Suffix appended to the plain-text output path; empty when
    # ``args.no_raw_ext`` is set.
    raw_ext = "" if args.no_raw_ext else ".raw.txt"

    info = loadpadinfo(args.padinfo)
    data = {'apikey': info['apikey']}

    padids = get_padids(args, info, data)

    # Turn the progress bar off when istty() is false
    # (presumably: output is not an interactive terminal -- confirm).
    progress_kwargs = {} if istty() else {"disable": True}
    progress_pads = tqdm(iterable=padids, total=len(padids), **progress_kwargs)

    for index, padid in enumerate(progress_pads):
        handle_pad(args, index, padid, data, info, raw_ext)