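"""
Scrapes a zKillboard battle report ("related kills" page), expands each
killmail with its hash from the zKillboard API and with details from EVE ESI,
and writes the resulting snapshot to a JSON file.

Example invocation (hypothetical battle-report URL, shaped as the code below
expects):

	python scrap.py https://zkillboard.com/related/30000142/201801010100/ --pretty
"""
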
import argparse
import json
import re
import sys
import time
from html.parser import HTMLParser

import requests


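# Captures the numeric entity ID from a corporation or alliance profile link.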
OWNER_HREF = re.compile(r"/(?:corporation|alliance)/(\d+)/?")
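# Captures the system and date components at the end of a battle-report URL.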
SYSTEM_AND_DATE = re.compile(r"/(\d+)/(\d+)/?$")


def get_hash(kill):
	"""
	Looks up and returns the hash of the *kill* using zKillboard's API.
	"""
	response = requests.get("https://zkillboard.com/api/killID/{}/".format(kill))
	response.raise_for_status()
	data = response.json()
	# The API returns a list of killmails; exactly one is expected for a single ID.
	if len(data) != 1:
		raise ValueError("expected exactly one killmail for kill ID {}".format(kill))
	return data[0]["zkb"]["hash"]


class RelatedParser(HTMLParser):
	"""
	Reads kill IDs and teams from zKillboard's related-kills page.
	"""
	def __init__(self):
		super().__init__()
		self._team = 0
		self._kills = set()
		self._current = None

	def handle_starttag(self, tag, attrs):
		attrs = dict(attrs)
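		# Assumed zKillboard markup: each team's kills sit in a <table id="killlist">,
		# each kill is a <tr class="killlistrow" killID="...">, and the row's links
		# include the victim's corporation or alliance page.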

		if tag == "table" and attrs.get("id", "").lower() == "killlist":
			self._team += 1

		if tag == "tr" and attrs.get("class", "").lower() == "killlistrow" and self._team > 0:
			self._flush()
			killid = attrs.get("killid", "")
			self._current = (killid, self._team, None)

		if tag == "a" and self._team > 0 and self._current:
			match = OWNER_HREF.match(attrs.get("href", ""))
			if match:
				self._current = (*self._current[:2], match.group(1))
				self._flush()

	def _flush(self):
		if self._current and all(self._current):
			self._kills.add(self._current)
			self._current = None

	@property
	def kills(self):
		"""
	Returns all kills found by the parser, along with each kill's team and the victim's corporation or alliance ID.
		"""
		self._flush()
		return self._kills


def get_related_kills(url):
	"""
	Builds a basic snapshot containing all killmails from the battle report at *url*.
	"""
	response = requests.get(url)
	response.raise_for_status()
	page = response.text
	related = RelatedParser()
	related.feed(page)
	killmails = []
	teams = (set(), set())
	for kill, team, victim in related.kills:
		killmails.append({"id": int(kill)})
		destination = teams[team - 1]
		destination.add(int(victim))
	return {"killmails": killmails, "teams": list(map(list, teams))}


def expand_hashes(snapshot):
	"""
	Expands killmails in *snapshot* IN PLACE by adding each killmail's hash fetched from zKillboard.
	"""
	for killmail in snapshot["killmails"]:
		time.sleep(1.05)  # zKillboard rate-limits aggressive clients; pace the requests.
		killmail["hash"] = get_hash(killmail["id"])
	return snapshot


def get_details(kill, kill_hash):
	"""
	Retrieves detailed information about a killmail from EVE ESI using the killmail's *kill* ID and *kill_hash*.
	"""
	query = "https://esi.evetech.net/latest/killmails/{}/{}/?datasource=tranquility"
	response = requests.get(query.format(kill, kill_hash))
	response.raise_for_status()
	return response.json()


def expand_details(snapshot):
	"""
	Expands killmails in *snapshot* IN PLACE by adding details from EVE ESI. Some data is dropped in the
	process, e.g. full attacker information, since it is not needed for the visualizations.
	"""
	for killmail in snapshot["killmails"]:
		details = get_details(killmail["id"], killmail["hash"])
		del details["attackers"]
		del details["victim"]["items"]
		del details["victim"]["damage_taken"]
		killmail.update(details)
	return snapshot


def output_name(args):
	"""
	Generates the name of the output file from the CLI *args*: the explicit --output value if given,
	otherwise "<system>_<date>.json" derived from the URL, falling back to "a.json".
	"""
	if args.output:
		return args.output
	match = SYSTEM_AND_DATE.search(args.url)
	if match:
		return "{}_{}.json".format(*match.groups())
	return "a.json"


def main():
	parser = argparse.ArgumentParser()
	parser.add_argument("url")
	parser.add_argument("-o", "--output")
	parser.add_argument("--pretty", action='store_true')
	args = parser.parse_args()
	snapshot = get_related_kills(args.url)
	expand_hashes(snapshot)
	expand_details(snapshot)
	filename = output_name(args)
	with open(filename, "w") as fd:
		opts = {"indent": 4} if args.pretty else {}
		fd.write(json.dumps(snapshot, **opts))
		fd.write("\n")


if __name__ == "__main__":
	main()