diff options
-rw-r--r-- | scrap.py | 46 |
1 files changed, 34 insertions, 12 deletions
@@ -3,22 +3,14 @@ import json import re import sys import time +from html.parser import HTMLParser import requests -KILL_HREF = re.compile(r"href=\"/kill/(\d+)/\"") +KILL_HREF = re.compile(r"/kill/(\d+)/?") SYSTEM_AND_DATE = re.compile(r"/(\d+)/(\d+)/$") -def unique_kills_in(page): - so_far = set() - for match in KILL_HREF.finditer(page): - kill = match.group(1) - if kill not in so_far: - so_far.add(kill) - yield kill - - def get_hash(kill): response = requests.get("https://zkillboard.com/api/killID/{}/".format(kill)) response.raise_for_status() @@ -28,6 +20,33 @@ def get_hash(kill): return data[0]["zkb"]["hash"] +class RelatedParser(HTMLParser): + def __init__(self): + super().__init__() + self._team = 0 + self._kills = set() + + def handle_starttag(self, tag, attrs): + attrs = dict(attrs) + + if tag == "table" and attrs.get("id", "") == "killlist": + self._team += 1 + + if tag == "a" and self._team > 0: + match = KILL_HREF.search(attrs.get("href", "")) + if match: + kill = (self._team, match.group(1)) + if kill not in self._kills: + self._kills.add(kill) + + def handle_endtag(self, tag): + pass + + @property + def kills(self): + return self._kills + + def main(): parser = argparse.ArgumentParser() parser.add_argument("url") @@ -38,10 +57,13 @@ def main(): response.raise_for_status() page = response.text + related = RelatedParser() + related.feed(page) + output = [] - for kill in unique_kills_in(page): + for team, kill in related.kills: time.sleep(1.05) # Zkillboard is very sensitive. - output.append({"id": kill, "hash": get_hash(kill)}) + output.append({"id": kill, "hash": get_hash(kill), "team": team}) if args.output: filename = args.output |