From 28e237004f599bc3fda78a9d50482319e8b50741 Mon Sep 17 00:00:00 2001
From: Aki
Date: Tue, 24 May 2022 23:38:27 +0200
Subject: Committed changes from tinkering

* Documented most of the functions
* Divided implementation into more distinct parts
* Loosened requirements on system and date regex
---
 scrap.py | 61 ++++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 38 insertions(+), 23 deletions(-)

diff --git a/scrap.py b/scrap.py
index 07e4036..769271a 100644
--- a/scrap.py
+++ b/scrap.py
@@ -9,10 +9,13 @@ import requests
 
 
 OWNER_HREF = re.compile(r"/(?:corporation|alliance)/(\d+)/?")
-SYSTEM_AND_DATE = re.compile(r"/(\d+)/(\d+)/$")
+SYSTEM_AND_DATE = re.compile(r"/(\d+)/(\d+)/?$")
 
 
 def get_hash(kill):
+    """
+    Looks up and returns hash of the *kill* using Zkillboard's API.
+    """
     response = requests.get("https://zkillboard.com/api/killID/{}/".format(kill))
     response.raise_for_status()
     data = response.json()
@@ -22,6 +25,9 @@ def get_hash(kill):
 
 
 class RelatedParser(HTMLParser):
+    """
+    Reads kill IDs and teams from Zkillboard's related kills page.
+    """
     def __init__(self):
         super().__init__()
         self._team = 0
@@ -37,7 +43,7 @@ class RelatedParser(HTMLParser):
         if tag == "tr" and attrs.get("class", "").lower() == "killlistrow" and self._team > 0:
             self._flush()
             killid = attrs.get("killid", "")
-            self._current = (self._team, killid, None)
+            self._current = (killid, self._team, None)
 
         if tag == "a" and self._team > 0 and self._current:
             match = OWNER_HREF.match(attrs.get("href", ""))
@@ -52,42 +58,51 @@ class RelatedParser(HTMLParser):
 
     @property
     def kills(self):
+        """
+        Returns all kills found by the parser along with their team and the ID of the victim. 
+        """
         self._flush()
         return self._kills
 
 
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("url")
-    parser.add_argument("-o", "--output")
-    args = parser.parse_args()
-
-    response = requests.get(args.url)
+def get_related_kills(url):
+    response = requests.get(url)
     response.raise_for_status()
     page = response.text
-
     related = RelatedParser()
     related.feed(page)
-
     killmails = []
     teams = (set(), set())
-    for team, kill, owner in related.kills:
+    for kill, team, victim in related.kills:
         time.sleep(1.05) # Zkillboard is very sensitive.
         killmails.append({"id": int(kill), "hash": get_hash(kill)})
         destination = teams[team - 1]
-        destination.add(int(owner))
+        destination.add(int(victim))
+    return {"killmails": killmails, "teams": list(map(list, teams))}
 
+
+def output_name(args):
+    """
+    Generates name of the output file based on the CLI *args*.
+    """
     if args.output:
-        filename = args.output
-    else:
-        match = SYSTEM_AND_DATE.search(args.url)
-        if match:
-            filename = "{}_{}.json".format(*match.groups())
-        else:
-            filename = "scrapped.json"
-
-    with open(filename, "w") as file:
-        file.write(json.dumps({"killmails": killmails, "teams": tuple(map(list, teams))}))
+        return args.output
+    match = SYSTEM_AND_DATE.search(args.url)
+    if match:
+        return "{}_{}.json".format(*match.groups())
+    return "a.json"
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("url")
+    parser.add_argument("-o", "--output")
+    args = parser.parse_args()
+    snapshot = get_related_kills(args.url)
+    filename = output_name(args)
+    with open(filename, "w") as fd:
+        fd.write(json.dumps(snapshot))
+        fd.write("\n")
 
 
 if __name__ == "__main__":
-- 
cgit v1.1