"""Scrape kill IDs from a zkillboard related-kills page, fetch each
killmail's hash from the zkillboard API, and dump the results to JSON."""

import argparse
import json
import re
import time
from html.parser import HTMLParser

import requests

# Kill links look like /kill/<id>/ ; related-page URLs end in /<system>/<date>/.
KILL_HREF = re.compile(r"/kill/(\d+)/?")
SYSTEM_AND_DATE = re.compile(r"/(\d+)/(\d+)/$")


def get_hash(kill):
    """Return the zkb hash for a single kill ID via the zkillboard API."""
    response = requests.get("https://zkillboard.com/api/killID/{}/".format(kill))
    response.raise_for_status()
    data = response.json()
    if len(data) != 1:
        raise ValueError("expected exactly one killmail for ID {}".format(kill))
    return data[0]["zkb"]["hash"]


class RelatedParser(HTMLParser):
    """Collect (team, kill ID) pairs from the kill-list tables on a related page."""

    def __init__(self):
        super().__init__()
        self._team = 0
        self._kills = set()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        # Each <table id="killlist"> starts a new team's list of kills.
        if tag == "table" and attrs.get("id", "") == "killlist":
            self._team += 1
        # Any kill link inside a team's table belongs to that team.
        if tag == "a" and self._team > 0:
            match = KILL_HREF.search(attrs.get("href", ""))
            if match:
                self._kills.add((self._team, match.group(1)))

    @property
    def kills(self):
        return self._kills


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("url")
    parser.add_argument("-o", "--output")
    args = parser.parse_args()

    response = requests.get(args.url)
    response.raise_for_status()
    page = response.text

    related = RelatedParser()
    related.feed(page)

    output = []
    for team, kill in related.kills:
        time.sleep(1.05)  # zKillboard rate-limits aggressively; stay just over one second per request.
        output.append({"id": kill, "hash": get_hash(kill), "team": team})

    if args.output:
        filename = args.output
    else:
        # Derive a default filename from the system and date at the end of the URL.
        match = SYSTEM_AND_DATE.search(args.url)
        if match:
            filename = "{}_{}.json".format(*match.groups())
        else:
            filename = "scraped.json"

    with open(filename, "w") as file:
        json.dump(output, file)


if __name__ == "__main__":
    main()
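
# A minimal usage sketch. The script name "related.py", the system ID, and the
# timestamp below are assumptions for illustration only; the URL just needs to
# be a zkillboard related-kills page ending in /<system>/<date>/ so that
# SYSTEM_AND_DATE can derive the default output filename:
#
#   python related.py https://zkillboard.com/related/30000142/202401011200/
#
# With no -o/--output given, this would write 30000142_202401011200.json,
# a list of objects shaped like {"id": "<kill ID>", "hash": "<zkb hash>", "team": 1}.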