Sport

This notebook shows how to use scikit-network to analyse sport data.

Here we consider the results of ATP Tour tennis matches played between 2001 and 2016.

[1]:
from IPython.display import SVG
[2]:
import numpy as np
import pandas as pd
from scipy import sparse
[3]:
from sknetwork.data import from_edge_list
from sknetwork.ranking import PageRank, top_k
from sknetwork.topology import get_core_decomposition
from sknetwork.utils import directed2undirected
from sknetwork.embedding import Spectral
from sknetwork.visualization import visualize_graph

Load data

[4]:
# Path of the semicolon-separated match results file, relative to the notebook.
filename = "atp.csv"
[5]:
# Load the raw match table; fields in the file are separated by ';'.
df = pd.read_csv(filename, delimiter=';')
[6]:
# Preview the first five rows to check the columns that were parsed.
df.head(n=5)
[6]:
ATP Location Tournament Date Series Court Surface Round Best of Winner ... L4 W5 L5 Wsets Lsets Comment MaxW MaxL AvgW AvgL
0 25 Houston U.S. Men's Clay Court Championships 2005-04-21 International Outdoor Clay 2nd Round 3 Haas T. ... NaN NaN NaN 2.0 0.0 Completed NaN NaN NaN NaN
1 26 Estoril Estoril Open 2005-04-27 International Series Outdoor Clay 2nd Round 3 Gaudio G. ... NaN NaN NaN 2.0 0.0 Completed NaN NaN NaN NaN
2 28 Rome Telecom Italia Masters Roma 2005-05-03 Masters Outdoor Clay 1st Round 3 Sanguinetti D. ... NaN NaN NaN 2.0 1.0 Completed NaN NaN NaN NaN
3 28 Rome Telecom Italia Masters Roma 2005-05-04 Masters Outdoor Clay 2nd Round 3 Almagro N. ... NaN NaN NaN 2.0 0.0 Completed NaN NaN NaN NaN
4 29 Hamburg Hamburg TMS 2005-05-11 Masters Outdoor Clay 2nd Round 3 Hrbaty D. ... NaN NaN NaN 2.0 0.0 Completed NaN NaN NaN NaN

5 rows × 32 columns

[7]:
# Keep only the rows whose 'Comment' column is exactly 'Completed'.
df = df.loc[df['Comment'].eq('Completed')]
[8]:
# Number of rows left after filtering.
df.shape[0]
[8]:
42261

Build graph

[9]:
# One (winner, loser) tuple per row of df.
edge_list = list(zip(df['Winner'], df['Loser']))
[10]:
# Number of (winner, loser) pairs; there is one per row of df.
len(edge_list)
[10]:
42261
[11]:
# Build a directed graph from the (winner, loser) pairs.
graph = from_edge_list(edge_list, directed=True)
[12]:
# Unpack the two attributes of the graph that the rest of the notebook uses:
# the adjacency matrix and the array of node names.
adjacency, names = graph.adjacency, graph.names
[13]:
# Display the matrix summary: shape, dtype and number of stored entries.
adjacency
[13]:
<1255x1255 sparse matrix of type '<class 'numpy.int64'>'
        with 28212 stored elements in Compressed Sparse Row format>
[14]:
# Number of node names returned with the graph.
len(names)
[14]:
1255

Ranking

[15]:
# Top-10 players by number of wins.
# Multiplying the adjacency matrix by a vector of ones sums each row,
# i.e. the total weight of the edges leaving each node.
n_players = len(names)
out_weights = adjacency @ np.ones(n_players)
top_winners = top_k(out_weights, 10)
print(names[top_winners])
['Federer R.' 'Nadal R.' 'Djokovic N.' 'Ferrer D.' 'Murray A.'
 'Roddick A.' 'Berdych T.' 'Robredo T.' 'Davydenko N.' 'Hewitt L.']
[16]:
# top-10 players in terms of PageRank
pagerank = PageRank()
adjacency_transpose = sparse.csr_matrix(adjacency.T)
scores = pagerank.fit_predict(adjacency_transpose)
print(names[top_k(scores, 10)])
['Federer R.' 'Nadal R.' 'Djokovic N.' 'Murray A.' 'Ferrer D.'
 'Roddick A.' 'Berdych T.' 'Hewitt L.' 'Davydenko N.' 'Wawrinka S.']
[17]:
# Indices of the 10 best PageRank scores, then the sub-matrix restricted to
# those rows and columns.
index = top_k(scores, 10)
sub_adjacency = adjacency[index, :][:, index]
[18]:
# Draw the sub-graph of the top-10 players, labelled by name, with their
# PageRank scores passed as `scores`.
image = visualize_graph(
    sub_adjacency,
    names=names[index],
    scores=scores[index],
)
SVG(image)
[18]:
../_images/use_cases_sport_22_0.svg

Core decomposition

[19]:
# Undirected version of the graph; this is the input of the core decomposition below.
adjacency_sym = directed2undirected(adjacency)
[20]:
# Core decomposition of the undirected graph. `values` is compared to its own
# maximum below to build a boolean mask over `names`, so it is indexed like `names`.
values = get_core_decomposition(adjacency_sym)
[21]:
# Names of the players whose core value equals the maximum core value.
top_core = np.max(values)
in_top_core = values == top_core
print(names[in_top_core])
['Acasuso J.' 'Almagro N.' 'Ancic M.' 'Anderson K.' 'Andreev I.'
 'Andujar P.' 'Baghdatis M.' 'Beck K.' 'Becker B.' 'Bellucci T.'
 'Benneteau J.' 'Berdych T.' 'Berlocq C.' 'Berrer M.' 'Bjorkman J.'
 'Blake J.' 'Bolelli S.' 'Calleri A.' 'Canas G.' 'Chardy J.' 'Chela J.I.'
 'Cilic M.' 'Clement A.' 'Coria G.' 'Cuevas P.' 'Darcis S.' 'Davydenko N.'
 'Del Potro J.M.' 'Dent T.' 'Dimitrov G.' 'Djokovic N.' 'Dodig I.'
 'Dolgopolov O.' 'Falla A.' 'Federer R.' 'Ferrer D.' 'Ferrero J.C.'
 'Fish M.' 'Fognini F.' 'Gabashvili T.' 'Garcia-Lopez G.' 'Gasquet R.'
 'Gaudio G.' 'Gicquel M.' 'Gimeno-Traver D.' 'Ginepri R.' 'Giraldo S.'
 'Golubev A.' 'Gonzalez F.' 'Granollers M.' 'Grosjean S.' 'Gulbis E.'
 'Haas T.' 'Haase R.' 'Hanescu V.' 'Harrison R.' 'Henman T.' 'Hernych J.'
 'Hewitt L.' 'Horna L.' 'Hrbaty D.' 'Isner J.' 'Istomin D.' 'Johansson T.'
 'Karlovic I.' 'Kiefer N.' 'Kohlschreiber P.' 'Korolev E.' 'Koubek S.'
 'Kubot L.' 'Kunitsyn I.' 'Lapentti N.' 'Lee H.T.' 'Ljubicic I.'
 'Llodra M.' 'Lopez F.' 'Lu Y.H.' 'Mahut N.' 'Malisse X.' 'Mannarino A.'
 'Martin A.' 'Massu N.' 'Mathieu P.H.' 'Mayer F.' 'Mayer L.' 'Melzer J.'
 'Mirnyi M.' 'Monaco J.' 'Monfils G.' 'Montanes A.' 'Moya C.' 'Muller G.'
 'Murray A.' 'Nadal R.' 'Nalbandian D.' 'Nieminen J.' 'Nishikori K.'
 'Novak J.' 'Paire B.' 'Pavel A.' 'Petzschner P.' 'Phau B.' 'Querrey S.'
 'Ramirez-Hidalgo R.' 'Raonic M.' 'Robredo T.' 'Rochus C.' 'Rochus O.'
 'Roddick A.' 'Roger-Vasselin E.' 'Rosol L.' 'Russell M.' 'Safin M.'
 'Santoro F.' 'Schuettler R.' 'Sela D.' 'Seppi A.' 'Serra F.' 'Simon G.'
 'Soderling R.' 'Spadea V.' 'Srichaphan P.' 'Stakhovsky S.' 'Starace P.'
 'Stepanek R.' 'Tipsarevic J.' 'Tomic B.' 'Troicki V.' 'Tsonga J.W.'
 'Tursunov D.' 'Verdasco F.' 'Vliegen K.' 'Volandri F.' 'Wawrinka S.'
 'Young D.' 'Youzhny M.' 'Zverev M.']

Embedding

[22]:
# Spectral embedding configured with 2 as first argument (presumably the embedding
# dimension — the result is used as 2-D node positions in the plot below).
spectral = Spectral(2, normalized=False)
[23]:
# Fit the embedding on the directed adjacency matrix; the result is indexed
# with `mask` below, so it has one row per node.
embedding = spectral.fit_transform(adjacency)
[24]:
# Boolean mask selecting the nodes whose core value equals the maximum.
top_core = np.max(values)
mask = values == top_core
[25]:
# Plot only the nodes selected by `mask`, placed at their embedding coordinates.
# No adjacency matrix is passed here, only positions, names and scores.
image = visualize_graph(
    position=embedding[mask],
    names=names[mask],
    scores=scores[mask],
    node_size=5,
    width=400,
    height=1000,
)
SVG(image)

[25]:
../_images/use_cases_sport_31_0.svg