# Provenance: commit da92888c59978ed81254f8359897363bc1fb02ff
# Author: Aki, Sat, 3 Apr 2021 03:12:11 +0200
# Subject: Added kill scraping script (new file: scrap.py)
"""Scrape kill IDs from a zKillboard page and look up each kill's hash.

The page at the URL given on the command line is downloaded, every
``href="/kill/<digits>/"`` link in it is collected (duplicates dropped,
first-seen order kept), and the zKillboard API is queried once per kill to
obtain its hash.  The resulting list of ``{"id": ..., "hash": ...}`` objects
is written to a JSON file.
"""

import argparse
import json
import re
import sys
import time

import requests

# Matches a kill link in the page HTML; group 1 is the numeric kill ID.
KILL_HREF = re.compile(r"href=\"/kill/(\d+)/\"")
# Matches two trailing numeric path segments of the URL.  Going by the name
# these are a system ID and a date -- the regex itself only requires digits.
SYSTEM_AND_DATE = re.compile(r"/(\d+)/(\d+)/$")

# API endpoint queried once per kill; ``{}`` is replaced with the kill ID.
KILL_API_URL = "https://zkillboard.com/api/killID/{}/"
# Pause before every API request.  Zkillboard is very sensitive.
API_DELAY_SECONDS = 1.05
# Without a timeout ``requests`` waits forever on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30
# Output filename used when none is given and the URL yields no name.
DEFAULT_FILENAME = "scrapped.json"


def unique_kills_in(page):
    """Yield each kill ID linked from ``page`` once, in first-seen order.

    Args:
        page: Text (HTML) to search for ``href="/kill/<digits>/"`` links.

    Yields:
        Kill IDs as strings of digits.
    """
    so_far = set()
    for match in KILL_HREF.finditer(page):
        kill = match.group(1)
        if kill not in so_far:
            so_far.add(kill)
            yield kill


def get_hash(kill):
    """Return the hash the zKillboard API reports for ``kill``.

    Args:
        kill: Kill ID to look up; it is formatted into the API URL.

    Returns:
        The value stored under ``["zkb"]["hash"]`` of the single returned
        record.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        ValueError: If the response is not a list holding exactly one record.
    """
    response = requests.get(
        KILL_API_URL.format(kill), timeout=REQUEST_TIMEOUT_SECONDS
    )
    response.raise_for_status()
    data = response.json()
    # An empty or non-list payload would otherwise surface as a confusing
    # IndexError/KeyError on the lookup below.
    if not isinstance(data, list) or len(data) != 1:
        raise ValueError(
            "expected exactly one record for kill {}, got {!r}".format(
                kill, data
            )
        )
    return data[0]["zkb"]["hash"]


def main():
    """Command-line entry point: scrape the given URL and write a JSON file.

    The output filename is, in order of preference: the ``-o``/``--output``
    argument, ``<first>_<second>.json`` built from the two trailing numeric
    segments of the URL, or ``scrapped.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("url")
    parser.add_argument("-o", "--output")
    args = parser.parse_args()

    response = requests.get(args.url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    page = response.text

    output = []
    for kill in unique_kills_in(page):
        # Sleep before each API call (including the first, which follows
        # the page download) to stay under the rate limit.
        time.sleep(API_DELAY_SECONDS)
        output.append({"id": kill, "hash": get_hash(kill)})

    if args.output:
        filename = args.output
    else:
        match = SYSTEM_AND_DATE.search(args.url)
        if match:
            filename = "{}_{}.json".format(*match.groups())
        else:
            filename = DEFAULT_FILENAME

    with open(filename, "w") as file:
        json.dump(output, file)


if __name__ == "__main__":
    main()