"""Scrape kill ids, hashes and team owners from a zKillboard HTML page.

The page at the URL given on the command line is parsed for tables whose
``id`` is ``killlist``; each such table is treated as one team.  For every
kill row found, the kill hash is fetched from the zKillboard API, and the
result is written to a JSON file as ``{"killmails": [...], "teams": [...]}``.
"""

import argparse
import json
import re
import sys
import time
from html.parser import HTMLParser

import requests

# Href of a corporation or alliance link; group 1 is the numeric owner id.
OWNER_HREF = re.compile(r"/(?:corporation|alliance)/(\d+)/?")
# Two trailing numeric path segments of the page URL, used to build the
# default output filename (system id and date, judging by the name).
SYSTEM_AND_DATE = re.compile(r"/(\d+)/(\d+)/$")

# Seconds to wait for an HTTP response before giving up instead of hanging.
REQUEST_TIMEOUT = 30
# Pause before each API call.
REQUEST_DELAY = 1.05


def get_hash(kill):
    """Return the zKillboard hash of the killmail with id *kill*.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        ValueError: if the API does not return exactly one killmail.
    """
    response = requests.get(
        "https://zkillboard.com/api/killID/{}/".format(kill),
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    data = response.json()
    # An empty answer used to surface as a bare IndexError; report both the
    # "none" and the "too many" case with the same descriptive error.
    if len(data) != 1:
        raise ValueError(
            "expected exactly one killmail for kill {}, got {}".format(
                kill, len(data)
            )
        )
    return data[0]["zkb"]["hash"]


class RelatedParser(HTMLParser):
    """Collect ``(team, kill_id, owner_id)`` tuples from kill-list tables.

    Every ``<table id="killlist">`` starts a new team (numbered from 1).
    Inside a team, each ``<tr class="killlistrow">`` opens a pending row
    that is completed by the first link matching ``OWNER_HREF``.
    """

    def __init__(self):
        super().__init__()
        self._team = 0  # number of kill-list tables seen so far
        self._kills = set()  # completed (team, kill_id, owner_id) tuples
        self._current = None  # pending row, or None when no row is open

    def handle_starttag(self, tag, attrs):
        """Track team tables, kill rows and owner links as tags open."""
        # HTMLParser reports valueless attributes (e.g. ``<a href>``) as
        # None; normalise them to "" so the string operations below are safe.
        attrs = {name: value or "" for name, value in attrs}

        if tag == "table" and attrs.get("id", "").lower() == "killlist":
            self._team += 1

        if (
            tag == "tr"
            and attrs.get("class", "").lower() == "killlistrow"
            and self._team > 0
        ):
            # Close the previous row before opening a new one.
            self._flush()
            killid = attrs.get("killid", "")
            self._current = (self._team, killid, None)

        if tag == "a" and self._team > 0 and self._current:
            match = OWNER_HREF.match(attrs.get("href", ""))
            if match:
                self._current = (*self._current[:2], match.group(1))
                # Flushing clears the pending row, so only the first owner
                # link of a row is recorded.
                self._flush()

    def _flush(self):
        """Store the pending row if all its fields are set, then clear it."""
        if self._current and all(self._current):
            self._kills.add(self._current)
        self._current = None

    @property
    def kills(self):
        """Set of ``(team, kill_id, owner_id)`` tuples collected so far."""
        self._flush()
        return self._kills


def main():
    """Fetch the page given on the command line and write the JSON result.

    The output goes to ``--output`` if given, otherwise to
    ``<a>_<b>.json`` built from the two trailing numeric segments of the
    URL, falling back to ``scrapped.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("url")
    parser.add_argument("-o", "--output")
    args = parser.parse_args()

    response = requests.get(args.url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    related = RelatedParser()
    related.feed(response.text)

    killmails = []
    # Only two teams are supported; a third kill-list table on the page
    # makes the ``teams[team - 1]`` lookup below raise IndexError.
    teams = ([], [])
    # Sorted so that the output order is reproducible between runs.
    for team, kill, owner in sorted(related.kills):
        time.sleep(REQUEST_DELAY)  # Zkillboard is very sensitive.
        killmails.append({"id": int(kill), "hash": get_hash(kill)})
        # Convert before the membership test: comparing the string id
        # against a list of ints never matched and produced duplicates.
        owner_id = int(owner)
        destination = teams[team - 1]
        if owner_id not in destination:
            destination.append(owner_id)

    if args.output:
        filename = args.output
    else:
        match = SYSTEM_AND_DATE.search(args.url)
        if match:
            filename = "{}_{}.json".format(*match.groups())
        else:
            filename = "scrapped.json"

    with open(filename, "w", encoding="utf-8") as file:
        file.write(json.dumps({"killmails": killmails, "teams": teams}))


if __name__ == "__main__":
    main()