summary refs log tree commit diff homepage
diff options
context:
space:
mode:
-rw-r--r-- scrap.py 46
1 files changed, 34 insertions, 12 deletions
diff --git a/scrap.py b/scrap.py
index 0a2bbae..73b4e1d 100644
--- a/scrap.py
+++ b/scrap.py
@@ -3,22 +3,14 @@ import json
import re
import sys
import time
+from html.parser import HTMLParser
import requests
-KILL_HREF = re.compile(r"href=\"/kill/(\d+)/\"")
+KILL_HREF = re.compile(r"/kill/(\d+)/?")
SYSTEM_AND_DATE = re.compile(r"/(\d+)/(\d+)/$")
-def unique_kills_in(page):
- so_far = set()
- for match in KILL_HREF.finditer(page):
- kill = match.group(1)
- if kill not in so_far:
- so_far.add(kill)
- yield kill
-
-
def get_hash(kill):
response = requests.get("https://zkillboard.com/api/killID/{}/".format(kill))
response.raise_for_status()
@@ -28,6 +20,33 @@ def get_hash(kill):
return data[0]["zkb"]["hash"]
+class RelatedParser(HTMLParser):
+    """Collect kill IDs from an HTML page, tagged with a team number.
+
+    Each ``<table id="killlist">`` start tag begins a new team; teams are
+    numbered from 1 in document order. Every ``<a>`` start tag seen after
+    the first such table whose ``href`` matches ``KILL_HREF`` contributes a
+    ``(team, kill_id)`` pair, where ``kill_id`` is the matched digit string.
+
+    End tags are ignored, so an anchor is attributed to the most recently
+    opened kill-list table even if that table has already been closed.
+
+    NOTE(review): presumably fed a zKillboard "related kills" page where
+    each side of the fight has its own kill-list table -- confirm.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Number of kill-list tables opened so far; 0 means "none yet".
+        self._team = 0
+        # Dict used as an insertion-ordered set of (team, kill_id) keys, so
+        # callers see kills in document order rather than arbitrary set order.
+        self._kills = {}
+
+    def handle_starttag(self, tag, attrs):
+        """Track kill-list tables and record kill links found after them.
+
+        ``attrs`` is the list of ``(name, value)`` pairs supplied by
+        ``HTMLParser``; ``value`` is ``None`` for valueless attributes.
+        """
+        attrs = dict(attrs)
+
+        if tag == "table" and attrs.get("id") == "killlist":
+            self._team += 1
+            return
+
+        # Only anchors that appear after the first kill-list table count.
+        if tag != "a" or self._team == 0:
+            return
+
+        # A bare ``<a href>`` yields None; fall back to "" so the regex
+        # search does not raise TypeError.
+        match = KILL_HREF.search(attrs.get("href") or "")
+        if match:
+            self._kills[(self._team, match.group(1))] = None
+
+    @property
+    def kills(self):
+        """Set-like, document-ordered view of unique ``(team, kill_id)`` pairs."""
+        return self._kills.keys()
+
+
def main():
parser = argparse.ArgumentParser()
parser.add_argument("url")
@@ -38,10 +57,13 @@ def main():
response.raise_for_status()
page = response.text
+ related = RelatedParser()
+ related.feed(page)
+
output = []
- for kill in unique_kills_in(page):
+ for team, kill in related.kills:
time.sleep(1.05) # Zkillboard is very sensitive.
- output.append({"id": kill, "hash": get_hash(kill)})
+ output.append({"id": kill, "hash": get_hash(kill), "team": team})
if args.output:
filename = args.output