From a984f8887cf42d3819bd713a33ca562d4711ef6f Mon Sep 17 00:00:00 2001 From: Aki Date: Tue, 27 Apr 2021 00:18:58 +0200 Subject: Adjusted scrapper to new battle model --- scrap.py | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/scrap.py b/scrap.py index e6058bd..c28b1c0 100644 --- a/scrap.py +++ b/scrap.py @@ -7,7 +7,8 @@ from html.parser import HTMLParser import requests -KILL_HREF = re.compile(r"/kill/(\d+)/?") + +OWNER_HREF = re.compile(r"/(?:corporation|alliance)/(\d+)/?") SYSTEM_AND_DATE = re.compile(r"/(\d+)/(\d+)/$") @@ -25,22 +26,33 @@ class RelatedParser(HTMLParser): super().__init__() self._team = 0 self._kills = set() + self._current = None def handle_starttag(self, tag, attrs): attrs = dict(attrs) - if tag == "table" and attrs.get("id", "") == "killlist": + if tag == "table" and attrs.get("id", "").lower() == "killlist": self._team += 1 - if tag == "a" and self._team > 0: - match = KILL_HREF.search(attrs.get("href", "")) + if tag == "tr" and attrs.get("class", "").lower() == "killlistrow" and self._team > 0: + self._flush() + killid = attrs.get("killid", "") + self._current = (self._team, killid, None) + + if tag == "a" and self._team > 0 and self._current: + match = OWNER_HREF.match(attrs.get("href", "")) if match: - kill = (self._team, match.group(1)) - if kill not in self._kills: - self._kills.add(kill) + self._current = (*self._current[:2], match.group(1)) + self._flush() + + def _flush(self): + if self._current and all(self._current): + self._kills.add(self._current) + self._current = None @property def kills(self): + self._flush() return self._kills @@ -57,10 +69,14 @@ def main(): related = RelatedParser() related.feed(page) - output = [] - for team, kill in related.kills: + killmails = [] + teams = ([], []) + for team, kill, owner in related.kills: time.sleep(1.05) # Zkillboard is very sensitive. - output.append({"id": kill, "hash": get_hash(kill), "team": team}) + killmails.append({"id": int(kill), "hash": get_hash(kill)}) + destination = teams[team - 1] + if owner not in destination: + destination.append(int(owner)) if args.output: filename = args.output @@ -72,7 +88,7 @@ def main(): filename = "scrapped.json" with open(filename, "w") as file: - file.write(json.dumps(output)) + file.write(json.dumps({"killmails": killmails, "teams": teams})) if __name__ == "__main__": -- cgit v1.1