#%%capture
%config InlineBackend.figure_format = 'svg'
%matplotlib inline
import scipy as sc, pandas as pd, seaborn as sns
import gpflow as gp
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
f64 = gp.utilities.to_default_float
survey_max_rank = 10
survey_min_rank = -30
rank_variance = 2
data_point_threshold = 1
linear_regression_threshold = 6
gpr_likelihood_variance_for_filling = 0.03
gpr_likelihood_variance_for_ranking = 0.01
gpr_ls_prior = 10
gpr_ls_prior_delta = 5
sns.set()
Below are some visualizations and tables based on the data gathered in the new Mar-Jun 2020 survey here. The raw data can be accessed here as a .csv file.
The plots were made by mapping ranks to integers: kyu ranks become negative and 1d corresponds to 0 (e.g. 1k -> -1, 1d -> 0, 2d -> 1). The tables are based on OGS ranks.
data = pd.read_csv('Go rank survey Mar to Jun 2020 en.csv')
data.head()
data.info()
def mapping(raw_rank):
    # Map a raw rank string such as '5k' or '2d' to an integer (1d -> 0, kyu ranks negative).
    # Non-string values (e.g. NaN) are returned unchanged.
    try:
        t = raw_rank[-1]
    except (TypeError, IndexError):
        return raw_rank
    if t == 'k':
        n = -int(raw_rank[:-1])
    else:
        n = int(raw_rank[:-1])-1
    return n
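# Quick sanity check of the mapping defined above: kyu ranks become negative, 1d maps to 0.
assert mapping('1k') == -1 and mapping('1d') == 0 and mapping('2d') == 1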
numerical_rank = data.iloc[:,1:].copy()
numerical_rank = numerical_rank.applymap(mapping)
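# keep only columns (servers) with more than data_point_threshold non-missing responses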
numerical_rank = numerical_rank.iloc[:,((numerical_rank.shape[0]-numerical_rank.isna().sum())>data_point_threshold).values]
numerical_rank.dropna().shape
numerical_rank_predict = numerical_rank
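# note: numerical_rank_predict refers to the same DataFrame object as numerical_rank,
# so the row drops and GPR fills applied below affect both names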
#numerical_rank
Outliers are detected automatically by looking for an overall rank difference of more than 7 ranks across platforms. However, because kyu ranks often fluctuate and GoQuest ranks follow their own scale, only records whose rank difference exceeds 7 and whose highest rank is at dan level or above (excluding GoQuest ranks) are treated as outliers.
# outlier detection: rank difference across platforms greater than 7
print("maximum value in each columns (dan rank -1, negative value is kyu)")
print(numerical_rank.max())
print("\nminimum value in each columns (dan rank -1, negative value is kyu)")
print(numerical_rank.min())
numerical_rank_cut = numerical_rank.drop(['GoQuest_rating'], axis=1)
max_rank_all = numerical_rank_cut.max().max()
min_rank_all = numerical_rank_cut.min().min()
print("\nmaximum rank in the survey is {}d, minimum rank is {}k".format(int(max_rank_all+1), int(-min_rank_all)))
rank_diff_row = numerical_rank_cut.max(axis=1) - numerical_rank_cut.min(axis=1)
#print(numerical_rank_cut.max(axis=1))
#print(numerical_rank_cut.min(axis=1))
#print(rank_diff_row)
#print(rank_diff_row[rank_diff_row>=6])
#print("\n")
numerical_rank_cut = numerical_rank.drop(['GoQuest_rank','GoQuest_rating'], axis=1)
rank_diff_row = numerical_rank_cut.max(axis=1) - numerical_rank_cut.min(axis=1)
possible_outlier_row = rank_diff_row[rank_diff_row>7]
#print(possible_outlier_row)
#print(possible_outlier_row.index)
#print(numerical_rank_cut)
print("\nlist of possible outliers with rank difference greater than 7")
data.iloc[possible_outlier_row.index,:]
#finding possible outliers, excluding player ranks below dan level
outlier_list = numerical_rank_cut.iloc[possible_outlier_row.index,:].max(axis=1)
outlier_index = data.iloc[outlier_list[outlier_list>=0].index,:].index
print("outliers index in the csv file row {}".format(outlier_index.values+2))
numerical_rank.drop(outlier_index, inplace=True)
#numerical_rank.drop([103,146,], inplace=True)
print("\nCleaned numerical rank table after dropping outliers")
numerical_rank
#%%capture --no-display
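# pairwise scatter plots with regression fits, restricted to servers with more than linear_regression_threshold responses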
sns.pairplot(numerical_rank.iloc[:,((numerical_rank.shape[0]-numerical_rank.isna().sum())>linear_regression_threshold).values], diag_kind='kde', kind='reg');
def plot(m, X, Y, lo=survey_min_rank, hi=survey_max_rank, xlabel='X', ylabel='Y', sigma=2):
    # Plot the training points, the GPR predictive mean, and a +/- sigma standard-deviation band.
    # The plotting range is derived from the data, so the lo/hi arguments are effectively overridden.
    lo = X.min()-abs(X.min()*.5/2)
    hi = X.max()+abs(X.max()*.5)
    xx = np.linspace(lo, hi, 1000)[:,None]
    mean, var = m.predict_y(xx)
    p=plt.figure()#figsize=(12, 6))
    plt.plot(X, Y, 'kx', mew=2)
    plt.plot(xx, mean, 'b', lw=2)
    plt.fill_between(xx[:,0], mean[:,0] - sigma*np.sqrt(var[:,0]), mean[:,0] + sigma*np.sqrt(var[:,0]), color='blue', alpha=0.2)
    plt.xlim(lo, hi)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    return p
#plot(m, x.T[0], x.T[1])
#sns.regplot(x.T[0], x.T[1])
def get_ranks_on_server(rank_start=min_rank_all, rank_end=max_rank_all, fro='OGS', to='Tygem', lo=survey_min_rank, hi=survey_max_rank, k=gp.kernels.RBF, meanf=gp.mean_functions.Zero, prior=(10,1e9), sigma=2):
    # Fit a GPR from ranks on server `fro` to ranks on server `to`, with a Gamma prior on the
    # kernel lengthscale, then predict the rank on `to` for every integer rank in [rank_start, rank_end).
    x=numerical_rank[[fro,to,]].dropna().values
    ranks = np.arange(rank_start, rank_end)
    #with gp.defer_build():
    X=x.T[0].reshape(-1, 1)
    Y=x.T[1].reshape(-1, 1)
    data = (X, Y)
    kern=k(1)
    mean_function=meanf()
    m=gp.models.GPR(data, kern, mean_function)
    m.kernel.lengthscales.prior = tfd.Gamma(f64(prior[0]), f64(prior[1]))
    #m.compile()
    #gp.utilities.print_summary(m, fmt="notebook")
    gp.config.set_default_summary_fmt("notebook")
    m.likelihood.variance.assign(gpr_likelihood_variance_for_ranking)
    #gp.utilities.print_summary(m.likelihood)
    m.trainable_parameters
    optz = gp.optimizers.Scipy()
    optz.minimize(m.training_loss, variables=m.trainable_variables, options=dict(disp=True, maxiter=1000))
    #gp.train.ScipyOptimizer(tol=1e-7).minimize(m)
    print('\nprocessing {}'.format(to))
    print(m.kernel.lengthscales)
    mean, var = m.predict_y(f64(ranks[:,None]))
    # Return the predictive mean, a (lower, upper) band at +/- sigma standard deviations, and the fitted plot.
    return mean,(mean[:,0] - sigma*np.sqrt(var[:,0]), mean[:,0] + sigma*np.sqrt(var[:,0])), plot(m, x.T[0], x.T[1], lo=lo, hi=hi, xlabel=fro, ylabel=to, sigma=sigma)
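# Illustrative usage (hypothetical call, not part of the original analysis):
# mean, band, fig = get_ranks_on_server(fro='OGS', to='KGS')
# fits OGS -> KGS and returns predictions for each integer rank from min_rank_all up to max_rank_all.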
def fillna(a='OGS', b='KGS'):
    # Fit a GPR from server `a` to server `b` on rows where both ranks are present, then predict `b`
    # for rows where `a` is known but `b` is missing and write the predictions into numerical_rank_predict.
    #with gp.defer_build():
    print("GPR model process for {}".format(a))
    X=numerical_rank[[a,b]].dropna()[a].values[:,None]
    Y=numerical_rank[[a,b]].dropna()[b].values[:,None]
    data = (X, Y)
    kern=gp.kernels.RBF(1)
    mean_function=gp.mean_functions.Linear()
    #noise_variance=0.01
    m=gp.models.GPR(data, kern, mean_function)
    #m.kern.lengthscales.prior = gp.priors.Gaussian(10,10)
    #m.compile()
    #gp.utilities.print_summary(m, fmt="notebook")
    gp.config.set_default_summary_fmt("notebook")
    m.likelihood.variance.assign(gpr_likelihood_variance_for_filling)
    #gp.utilities.print_summary(m.likelihood)
    #gp.utilities.set_trainable(m.kernel.kernels[1].variance, True)
    m.trainable_parameters
    optz = gp.optimizers.Scipy()
    optz.minimize(m.training_loss, variables=m.trainable_variables, options=dict(disp=True, maxiter=1000))
    xx=numerical_rank[numerical_rank[a].notna()&numerical_rank[b].isna()][a].values[:,None]
    yy=m.predict_y(xx)[0].numpy()
    #print(yy.shape)
    #print(yy)
    #print(numerical_rank[a])
    #print(numerical_rank[b])
    #print(numerical_rank[a].notna()&numerical_rank[b].isna())
    #print(numerical_rank_predict)
    #print(numerical_rank_predict.loc[numerical_rank[a].notna()&numerical_rank[b].isna(), b])
    numerical_rank_predict.loc[numerical_rank[a].notna()&numerical_rank[b].isna(), b]=yy.ravel()
    #print(numerical_rank_predict[a])
    #print(numerical_rank_predict[b])
    #X.loc[X[a].notna()&X[b].isna(), b] = yy.ravel()
#%%capture
sorted_cols = sorted(numerical_rank.columns, key=lambda x: numerical_rank[x].count(), reverse=True)
print(sorted_cols)
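# fill missing OGS ranks from each of the other servers, starting with the columns that have the most responses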
for c in [x for x in sorted_cols if x!='OGS']:
    #fillna('MIX',c)
    fillna(c,'OGS')
def n_to_rank(n):
    # Convert a numeric rank back to a kyu/dan string; values far above the survey range are
    # returned as plain rounded numbers.
    if n>survey_max_rank+rank_variance:
        return int(round(n))
    n=int(round(n))
    if n<0:
        return f'{abs(n)}k'
    else:
        return f'{abs(n+1)}d'
def n_to_rank_float(n):
    # Same as n_to_rank but keeps one decimal place.
    if n>survey_max_rank+rank_variance:
        return round(n,1)
    n=round(n,1)
    if n<0:
        return f'{abs(n)}k'
    else:
        return f'{abs(n+1)}d'
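# Quick sanity check of the conversion back to kyu/dan strings defined above.
assert n_to_rank(-3) == '3k' and n_to_rank(0) == '1d' and n_to_rank(1) == '2d'
assert n_to_rank_float(0.3) == '1.3d'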
def get_rank_tables(against='KGS', lo=min_rank_all, hi=max_rank_all, prior=(10,1e6)):
    # Build three tables indexed by the rank on `against`: the predicted mean rank per server,
    # a +/- 1 std interval per server, and a combined "mean ± std" table.
    meantable = pd.DataFrame()
    stdtable = pd.DataFrame()
    combinedtable = pd.DataFrame()
    for s in numerical_rank_predict.drop(columns=[against,]).columns:
        mean,ci,p = get_ranks_on_server(
            lo,
            hi,
            against,
            s,
            k=lambda x: gp.kernels.RBF(1),
            meanf=gp.mean_functions.Linear,
            prior=prior,
            sigma=1,
        )
        #print("processing {}".format(s))
        #print(mean)
        #for x,_ in enumerate(mean):
        # print(mean[x][0].numpy())
        #print(ci[0])
        #print(ci[1])
        #for x,_ in enumerate(ci[0]):
        # print(ci[0][x].numpy())
        # print(ci[1][x].numpy())
        meantable.insert(loc=0, column=s, value=[f'{n_to_rank(mean[x][0].numpy())}' for x,_ in enumerate(mean)])
        stdtable.insert(loc=0, column=s, value=[f'{n_to_rank(ci[0][x].numpy())} - {n_to_rank(ci[1][x].numpy())}' for x,_ in enumerate(ci[0])])
        combinedtable.insert(loc=0, column=s, value=[f'{n_to_rank_float(mean[x][0].numpy())} ± {round(ci[1][x].numpy()-mean[x][0].numpy(),1)}' for x,_ in enumerate(mean)])
    meantable.insert(loc=0, column=against, value=[f'{n_to_rank(x)}' for x in np.arange(lo,hi)])
    stdtable.insert(loc=0, column=against, value=[f'{n_to_rank(x)}' for x in np.arange(lo,hi)])
    combinedtable.insert(loc=0, column=against, value=[f'{n_to_rank(x)}' for x in np.arange(lo,hi)])
    return meantable, stdtable, combinedtable
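# Illustrative usage (same pattern as the call further below):
# mt, st, ct = get_rank_tables(against='OGS') returns the mean table, the ±1 std interval table,
# and the combined table.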
The following plots mainly serve to demonstrate visually how imprecise the rank estimates below are.
#%%capture --no-display
a='OGS'
mt1,st1,ct1=get_rank_tables(against=a, prior=(gpr_ls_prior,gpr_ls_prior-gpr_ls_prior_delta))
cols = ['KGS','OGS','IGS','Foxwq','Tygem','WBaduk','DGS','GoQuest_rating', 'GoQuest_rank','EGF','AGA','Taiwan','Japan','China',]
print(cols)
#print(numerical_rank_predict.columns)
#print(sorted_cols)
mt1[cols].set_index(a)
st1[cols].set_index(a)
ct1[cols].set_index(a)