Sport
This notebook shows how to use scikit-network to analyse sport data.
Here we consider the results of ATP Tour tennis matches played in the period 2001–2016.
[1]:
from IPython.display import SVG
[2]:
import numpy as np
import pandas as pd
from scipy import sparse
[3]:
from sknetwork.data import from_edge_list
from sknetwork.ranking import PageRank, top_k
from sknetwork.topology import get_core_decomposition
from sknetwork.utils import directed2undirected
from sknetwork.embedding import Spectral
from sknetwork.visualization import visualize_graph
Load data
[4]:
# Path of the semicolon-separated match results file read in the next cell.
filename = "atp.csv"
[5]:
# Load the match results; fields in the file are separated by ';'.
df = pd.read_csv(filename, delimiter=';')
[6]:
# Preview the first five matches.
df.head(n=5)
[6]:
ATP | Location | Tournament | Date | Series | Court | Surface | Round | Best of | Winner | ... | L4 | W5 | L5 | Wsets | Lsets | Comment | MaxW | MaxL | AvgW | AvgL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 25 | Houston | U.S. Men's Clay Court Championships | 2005-04-21 | International | Outdoor | Clay | 2nd Round | 3 | Haas T. | ... | NaN | NaN | NaN | 2.0 | 0.0 | Completed | NaN | NaN | NaN | NaN |
1 | 26 | Estoril | Estoril Open | 2005-04-27 | International Series | Outdoor | Clay | 2nd Round | 3 | Gaudio G. | ... | NaN | NaN | NaN | 2.0 | 0.0 | Completed | NaN | NaN | NaN | NaN |
2 | 28 | Rome | Telecom Italia Masters Roma | 2005-05-03 | Masters | Outdoor | Clay | 1st Round | 3 | Sanguinetti D. | ... | NaN | NaN | NaN | 2.0 | 1.0 | Completed | NaN | NaN | NaN | NaN |
3 | 28 | Rome | Telecom Italia Masters Roma | 2005-05-04 | Masters | Outdoor | Clay | 2nd Round | 3 | Almagro N. | ... | NaN | NaN | NaN | 2.0 | 0.0 | Completed | NaN | NaN | NaN | NaN |
4 | 29 | Hamburg | Hamburg TMS | 2005-05-11 | Masters | Outdoor | Clay | 2nd Round | 3 | Hrbaty D. | ... | NaN | NaN | NaN | 2.0 | 0.0 | Completed | NaN | NaN | NaN | NaN |
5 rows × 32 columns
[7]:
# Keep only the matches whose 'Comment' field is exactly 'Completed'.
is_completed = df['Comment'] == 'Completed'
df = df.loc[is_completed]
[8]:
# Number of matches kept after filtering.
df.shape[0]
[8]:
42261
Build graph
[9]:
# One (winner, loser) tuple per match, in row order.
edge_list = list(zip(df['Winner'], df['Loser']))
[10]:
# Sanity check: one edge per remaining match, so this should equal len(df).
len(edge_list)
[10]:
42261
[11]:
# Build a directed graph from the (winner, loser) pairs: each edge points
# from the winner of a match to the loser.
graph = from_edge_list(edge_list, directed=True)
[12]:
# Unpack the adjacency matrix and the node names (player names) of the graph.
adjacency, names = graph.adjacency, graph.names
[13]:
# Show the summary of the sparse adjacency matrix (shape, dtype, stored entries).
adjacency
[13]:
<1255x1255 sparse matrix of type '<class 'numpy.int64'>'
with 28212 stored elements in Compressed Sparse Row format>
[14]:
# Number of nodes, i.e. players appearing as winner or loser in the edge list.
len(names)
[14]:
1255
Ranking
[15]:
# Top 10 players by number of wins.
# A player's out-weight is the total weight of their outgoing
# (winner -> loser) edges.
ones = np.ones(len(names))
out_weights = adjacency @ ones
top_winners = top_k(out_weights, 10)
print(names[top_winners])
['Federer R.' 'Nadal R.' 'Djokovic N.' 'Ferrer D.' 'Murray A.'
'Roddick A.' 'Berdych T.' 'Robredo T.' 'Davydenko N.' 'Hewitt L.']
[16]:
# Top 10 players by PageRank.
# The graph is transposed first, so edges point from loser to winner and
# the ranking is computed on the reversed graph.
adjacency_transpose = adjacency.T.tocsr()
pagerank = PageRank()
scores = pagerank.fit_predict(adjacency_transpose)
top_ranked = top_k(scores, 10)
print(names[top_ranked])
['Federer R.' 'Nadal R.' 'Djokovic N.' 'Murray A.' 'Ferrer D.'
'Roddick A.' 'Berdych T.' 'Hewitt L.' 'Davydenko N.' 'Wawrinka S.']
[17]:
# Restrict the graph to the 10 players with the highest PageRank scores:
# select their rows first, then their columns.
index = top_k(scores, 10)
sub_adjacency = adjacency[index, :][:, index]
[18]:
# Draw the subgraph of the top-ranked players, labelled with their names and
# with their PageRank scores passed as node scores.
image = visualize_graph(
    sub_adjacency,
    names=names[index],
    scores=scores[index],
)
SVG(image)
[18]:
Core decomposition
[19]:
# Undirected version of the match graph; the core decomposition is computed on it.
adjacency_sym = directed2undirected(adjacency)
[20]:
# Core value of each node, computed on the undirected graph.
values = get_core_decomposition(adjacency_sym)
[21]:
# Players whose core value is the largest one found.
in_top_core = values == values.max()
print(names[in_top_core])
['Acasuso J.' 'Almagro N.' 'Ancic M.' 'Anderson K.' 'Andreev I.'
'Andujar P.' 'Baghdatis M.' 'Beck K.' 'Becker B.' 'Bellucci T.'
'Benneteau J.' 'Berdych T.' 'Berlocq C.' 'Berrer M.' 'Bjorkman J.'
'Blake J.' 'Bolelli S.' 'Calleri A.' 'Canas G.' 'Chardy J.' 'Chela J.I.'
'Cilic M.' 'Clement A.' 'Coria G.' 'Cuevas P.' 'Darcis S.' 'Davydenko N.'
'Del Potro J.M.' 'Dent T.' 'Dimitrov G.' 'Djokovic N.' 'Dodig I.'
'Dolgopolov O.' 'Falla A.' 'Federer R.' 'Ferrer D.' 'Ferrero J.C.'
'Fish M.' 'Fognini F.' 'Gabashvili T.' 'Garcia-Lopez G.' 'Gasquet R.'
'Gaudio G.' 'Gicquel M.' 'Gimeno-Traver D.' 'Ginepri R.' 'Giraldo S.'
'Golubev A.' 'Gonzalez F.' 'Granollers M.' 'Grosjean S.' 'Gulbis E.'
'Haas T.' 'Haase R.' 'Hanescu V.' 'Harrison R.' 'Henman T.' 'Hernych J.'
'Hewitt L.' 'Horna L.' 'Hrbaty D.' 'Isner J.' 'Istomin D.' 'Johansson T.'
'Karlovic I.' 'Kiefer N.' 'Kohlschreiber P.' 'Korolev E.' 'Koubek S.'
'Kubot L.' 'Kunitsyn I.' 'Lapentti N.' 'Lee H.T.' 'Ljubicic I.'
'Llodra M.' 'Lopez F.' 'Lu Y.H.' 'Mahut N.' 'Malisse X.' 'Mannarino A.'
'Martin A.' 'Massu N.' 'Mathieu P.H.' 'Mayer F.' 'Mayer L.' 'Melzer J.'
'Mirnyi M.' 'Monaco J.' 'Monfils G.' 'Montanes A.' 'Moya C.' 'Muller G.'
'Murray A.' 'Nadal R.' 'Nalbandian D.' 'Nieminen J.' 'Nishikori K.'
'Novak J.' 'Paire B.' 'Pavel A.' 'Petzschner P.' 'Phau B.' 'Querrey S.'
'Ramirez-Hidalgo R.' 'Raonic M.' 'Robredo T.' 'Rochus C.' 'Rochus O.'
'Roddick A.' 'Roger-Vasselin E.' 'Rosol L.' 'Russell M.' 'Safin M.'
'Santoro F.' 'Schuettler R.' 'Sela D.' 'Seppi A.' 'Serra F.' 'Simon G.'
'Soderling R.' 'Spadea V.' 'Srichaphan P.' 'Stakhovsky S.' 'Starace P.'
'Stepanek R.' 'Tipsarevic J.' 'Tomic B.' 'Troicki V.' 'Tsonga J.W.'
'Tursunov D.' 'Verdasco F.' 'Vliegen K.' 'Volandri F.' 'Wawrinka S.'
'Young D.' 'Youzhny M.' 'Zverev M.']
Embedding
[22]:
# Spectral embedding in 2 dimensions, with normalization switched off.
spectral = Spectral(n_components=2, normalized=False)
[23]:
# Embed the players; the result is indexed by node in the plotting cell.
# NOTE(review): fitted on the directed adjacency rather than adjacency_sym — confirm this is intended.
embedding = spectral.fit_transform(adjacency)
[24]:
# Boolean mask selecting the nodes with the largest core value.
mask = values == values.max()
[25]:
# Plot the masked (top-core) players at their embedding positions, with their
# PageRank scores passed as node scores. No adjacency is given, only positions.
core_image = visualize_graph(
    position=embedding[mask],
    names=names[mask],
    scores=scores[mask],
    node_size=5,
    width=400,
    height=1000,
)
SVG(core_image)
[25]: