#%%capture
%config InlineBackend.figure_format = 'svg'
%matplotlib inline
import scipy as sc, pandas as pd, seaborn as sns
import gpflow as gp
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
f64 = gp.utilities.to_default_float  # shorthand: cast values to gpflow's default float dtype
# --- Survey / model configuration constants ---
survey_max_rank = 10                         # top of the plotted rank axis (numeric scale: 1d == 0)
survey_min_rank = -30                        # bottom of the plotted rank axis (30k)
rank_variance = 2                            # slack above survey_max_rank before a value is treated as a raw rating (see n_to_rank)
data_point_threshold = 1                     # min non-NaN answers for a column to be kept at all
linear_regression_threshold = 25             # min non-NaN answers for a column to appear in the pairplot
gpr_likelihood_variance_for_filling = 0.03   # GPR observation-noise variance when imputing missing ranks (fillna)
gpr_likelihood_variance_for_ranking = 0.01   # GPR observation-noise variance when building the rank tables
gpr_ls_prior = 10                            # Gamma-prior concentration for the RBF lengthscale
gpr_ls_prior_delta = 5                       # subtracted from gpr_ls_prior to form the Gamma rate (see get_rank_tables call)
sns.set()  # seaborn default plot styling
Following are some visualizations and tables based on the data gathered in the new Mar-Jun 2020 survey here. The raw data can be accessed here in the form of a .csv file.
The plots were made by mapping kyu ranks to negative integers, so that 1d corresponds to 0 (i.e. 1k -> -1, 2d -> 1). The tables are based on OGS ranks.
# Load the raw survey responses. The first CSV column is excluded from the
# numeric processing below (presumably a timestamp — TODO confirm).
data = pd.read_csv('Go rank survey Mar to Jun 2020 en.csv')
#data.head()
data.info()
def mapping(raw_rank):
    """Map a rank string to the survey's integer scale.

    'Nk' -> -N (kyu ranks are negative), 'Nd' -> N-1 (so 1d == 0).
    Non-string values (NaN floats from pandas, numeric ratings) and the
    empty string are returned unchanged.

    The original bare ``except:`` is narrowed to the two exceptions the
    sentinel lookup can actually raise: TypeError (non-subscriptable
    values such as NaN) and IndexError (empty string).
    """
    try:
        suffix = raw_rank[-1]
    except (TypeError, IndexError):
        # Not a rank string — pass it through untouched.
        return raw_rank
    if suffix == 'k':
        return -int(raw_rank[:-1])
    # Dan ranks: shift so that 1d maps to 0.
    return int(raw_rank[:-1]) - 1
# Drop the first CSV column and convert every rank string to the numeric scale.
numerical_rank = data.iloc[:,1:].copy()
numerical_rank = numerical_rank.applymap(mapping)
# Keep only columns with more than `data_point_threshold` non-NaN answers.
numerical_rank = numerical_rank.iloc[:,((numerical_rank.shape[0]-numerical_rank.isna().sum())>data_point_threshold).values]
numerical_rank.dropna().shape
# NOTE(review): this is an alias, not a copy — fillna() later writes through
# numerical_rank_predict.loc, which therefore also mutates numerical_rank.
# Confirm that sharing is intended.
numerical_rank_predict = numerical_rank
#numerical_rank
Outliers are detected automatically by finding an overall rank difference of more than 7 ranks across all platforms. However, because kyu ranks often fluctuate and the GoQuest rank uses its own distinctive scale, only respondents whose ranks (excluding GoQuest ranks) reach dan level were considered outliers.
# --- Outlier detection ---
# Overall max/min ranks for reporting (only GoQuest_rating excluded here).
numerical_rank_cut = numerical_rank.drop(['GoQuest_rating'], axis=1)
max_rank_all = numerical_rank_cut.max().max()
min_rank_all = numerical_rank_cut.min().min()
print("\nmaximum rank in the survey is {}d, minimum rank is {}k".format(int(max_rank_all+1), int(-min_rank_all)))
# Per-row rank spread with BOTH GoQuest columns excluded (GoQuest uses its
# own rating scale). The original code computed rank_diff_row once from the
# first cut, immediately overwrote it, and then recomputed max-min a third
# time inside the filter — the dead work is removed here; results are identical.
numerical_rank_cut = numerical_rank.drop(['GoQuest_rank','GoQuest_rating'], axis=1)
rank_diff_row = numerical_rank_cut.max(axis=1) - numerical_rank_cut.min(axis=1)
# Rows whose platform ranks disagree by more than 7 ranks are outlier candidates.
possible_outlier_row = rank_diff_row[rank_diff_row > 7]
# Of the candidates, only those whose best rank reaches dan level (>= 0) are
# treated as true outliers (kyu ranks fluctuate too much to judge).
outlier_list = numerical_rank_cut.iloc[possible_outlier_row.index, :].max(axis=1)
outlier_index = data.iloc[outlier_list[outlier_list >= 0].index, :].index
# +2: account for the CSV header row and 1-based spreadsheet numbering.
print("outliers index in the csv file row {}".format(outlier_index.values + 2))
numerical_rank.drop(outlier_index, inplace=True)
#%%capture --no-display
# Pairwise scatter + regression plots, limited to columns that have more than
# linear_regression_threshold non-NaN answers.
sns.pairplot(numerical_rank.iloc[:,((numerical_rank.shape[0]-numerical_rank.isna().sum())>linear_regression_threshold).values], diag_kind='kde', kind='reg');
def plot(m, X, Y, lo=survey_min_rank, hi=survey_max_rank, xlabel='X', ylabel='Y', sigma=2):
    """Plot the GP posterior mean with a +/- sigma*std band over the data.

    NOTE(review): the lo/hi arguments are immediately replaced by limits
    derived from X, so the values passed in are effectively ignored.
    """
    lo = X.min() - abs(X.min() * .5 / 2)
    hi = X.max() + abs(X.max() * .5)
    grid = np.linspace(lo, hi, 1000)[:, None]
    mu, var = m.predict_y(grid)
    band = sigma * np.sqrt(var[:, 0])
    fig = plt.figure()
    # Raw observations as black crosses, posterior mean as a blue line.
    plt.plot(X, Y, 'kx', mew=2)
    plt.plot(grid, mu, 'b', lw=2)
    plt.fill_between(grid[:, 0], mu[:, 0] - band, mu[:, 0] + band, color='blue', alpha=0.2)
    plt.xlim(lo, hi)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    return fig
def get_ranks_on_server(rank_start=min_rank_all, rank_end=max_rank_all, fro='OGS', to='Tygem', lo=survey_min_rank, hi=survey_max_rank, k=gp.kernels.RBF, meanf=gp.mean_functions.Zero, prior=(10,1e9), sigma=2):
    """Fit a GPR mapping ranks on server `fro` to ranks on server `to`.

    Predicts the `to`-rank for every integer rank in
    np.arange(rank_start, rank_end) (end-exclusive).

    Parameters:
        rank_start, rank_end: bounds of the rank grid; defaults are
            min_rank_all/max_rank_all captured at definition time.
        fro, to: column names in numerical_rank (input / output server).
        lo, hi: forwarded to plot() (which recomputes its own limits).
        k: kernel factory, called as k(1).
        meanf: mean-function factory, called with no arguments.
        prior: (concentration, rate) of the Gamma prior on the RBF lengthscale.
        sigma: half-width of the returned credible interval, in std deviations.

    Returns:
        (mean, (lower, upper), figure)
    """
    # Only rows where the respondent answered on both servers.
    x=numerical_rank[[fro,to,]].dropna().values
    ranks = np.arange(rank_start, rank_end)
    #with gp.defer_build():
    X=x.T[0].reshape(-1, 1)
    Y=x.T[1].reshape(-1, 1)
    # NOTE: shadows the module-level `data` DataFrame inside this function.
    data = (X, Y)
    kern=k(1)
    mean_function=meanf()
    m=gp.models.GPR(data, kern, mean_function)
    m.kernel.lengthscales.prior = tfd.Gamma(f64(prior[0]), f64(prior[1]))
    #m.compile()
    #gp.utilities.print_summary(m, fmt="notebook")
    gp.config.set_default_summary_fmt("notebook")
    # Fixed observation noise; only kernel/mean parameters are optimized below.
    m.likelihood.variance.assign(gpr_likelihood_variance_for_ranking)
    #gp.utilities.print_summary(m.likelihood)
    m.trainable_parameters  # NOTE(review): no-op expression, value is discarded
    optz = gp.optimizers.Scipy()
    optz.minimize(m.training_loss, variables=m.trainable_variables, options=dict(disp=True, maxiter=1000))
    #gp.train.ScipyOptimizer(tol=1e-7).minimize(m)
    #print('\n processing {}'.format(to))
    #print(m.kernel.lengthscales)
    mean, var = m.predict_y(f64(ranks[:,None]))
    return mean,(mean[:,0] - sigma*np.sqrt(var[:,0]), mean[:,0] + sigma*np.sqrt(var[:,0])), plot(m, x.T[0], x.T[1], lo=lo, hi=hi, xlabel=fro, ylabel=to, sigma=sigma)
def fillna(a='OGS', b='KGS'):
    """Impute missing values of column `b` from column `a`.

    Fits a GPR (RBF kernel, linear mean function) on the rows where both
    `a` and `b` are present, then fills the rows where `a` is present but
    `b` is missing with the model's predicted mean.

    Side effect: mutates the module-level numerical_rank_predict in place
    (which aliases numerical_rank). Returns None.
    """
    #with gp.defer_build():
    #print("GPR model process for {}".format(a))
    # Training pairs: rows with both columns answered.
    X=numerical_rank[[a,b]].dropna()[a].values[:,None]
    Y=numerical_rank[[a,b]].dropna()[b].values[:,None]
    data = (X, Y)
    kern=gp.kernels.RBF(1)
    mean_function=gp.mean_functions.Linear()
    #noise_variance=0.01
    m=gp.models.GPR(data, kern, mean_function)
    #m.kern.lengthscales.prior = gp.priors.Gaussian(10,10)
    #m.compile()
    #gp.utilities.print_summary(m, fmt="notebook")
    gp.config.set_default_summary_fmt("notebook")
    # Fixed observation noise for the imputation model.
    m.likelihood.variance.assign(gpr_likelihood_variance_for_filling)
    #gp.utilities.print_summary(m.likelihood)
    #gp.utilities.set_trainable(m.kernel.kernels[1].variance, True)
    m.trainable_parameters  # NOTE(review): no-op expression, value is discarded
    optz = gp.optimizers.Scipy()
    optz.minimize(m.training_loss, variables=m.trainable_variables, options=dict(disp=True, maxiter=1000))
    # Rows with `a` answered but `b` missing: predict and write back in place.
    xx=numerical_rank[numerical_rank[a].notna()&numerical_rank[b].isna()][a].values[:,None]
    yy=m.predict_y(xx)[0].numpy()
    numerical_rank_predict.loc[numerical_rank[a].notna()&numerical_rank[b].isna(), b]=yy.ravel()
    #X.loc[X[a].notna()&X[b].isna(), b] = yy.ravel()
#%%capture
# Columns ordered by number of responses, most answered first.
sorted_cols = sorted(numerical_rank.columns, key=lambda x: numerical_rank[x].count(), reverse=True)
print(sorted_cols)
# Impute missing OGS ranks from every other column, most-populated column first.
# NOTE(review): because numerical_rank_predict aliases numerical_rank, later
# iterations may train on OGS values imputed by earlier ones — confirm intended.
for c in [x for x in sorted_cols if x!='OGS']:
    #fillna('MIX',c)
    fillna(c,'OGS')
def n_to_rank(n):
    """Render a numeric rank as a Go rank string ('5k', '2d').

    Values above survey_max_rank + rank_variance are taken to be raw
    ratings rather than ranks and are returned as a rounded int instead.
    """
    if n > survey_max_rank + rank_variance:
        return int(round(n))
    rounded = int(round(n))
    # Negative values are kyu ranks; 0 and up are dan ranks shifted by one.
    return f'{-rounded}k' if rounded < 0 else f'{rounded + 1}d'
def n_to_rank_float(n):
    """Render a numeric rank as a Go rank string with one decimal ('3.5k').

    Values above survey_max_rank + rank_variance are taken to be raw
    ratings rather than ranks and are returned rounded to one decimal.
    """
    if n > survey_max_rank + rank_variance:
        return round(n, 1)
    rounded = round(n, 1)
    # Negative values are kyu ranks; 0 and up are dan ranks shifted by one.
    return f'{-rounded}k' if rounded < 0 else f'{rounded + 1}d'
def get_rank_tables(against='KGS', lo=min_rank_all, hi=max_rank_all, prior=(10,1e6)):
    """Build rank-conversion tables using `against` as the reference server.

    For every other column a GPR is fit from `against` ranks, and for each
    integer rank in np.arange(lo, hi) three tables are filled:
        meantable: rounded predicted rank,
        stdtable: sigma=1 credible interval as 'low - high',
        combinedtable: 'mean ± half-width' (upper bound minus mean).

    NOTE: get_ranks_on_server is called positionally, so lo/hi here become
    its rank_start/rank_end — the lo/hi plot arguments keep their defaults.

    Returns:
        (meantable, stdtable, combinedtable)
    """
    meantable = pd.DataFrame()
    stdtable = pd.DataFrame()
    combinedtable = pd.DataFrame()
    for s in numerical_rank_predict.drop(columns=[against,]).columns:
        mean,ci,p = get_ranks_on_server(
            lo,
            hi,
            against,
            s,
            k=lambda x: gp.kernels.RBF(1),  # ignores its argument; always RBF(1)
            meanf=gp.mean_functions.Linear,
            prior=prior,
            sigma=1,
        )
        meantable.insert(loc=0, column=s, value=[f'{n_to_rank(mean[x][0].numpy())}' for x,_ in enumerate(mean)])
        stdtable.insert(loc=0, column=s, value=[f'{n_to_rank(ci[0][x].numpy())} - {n_to_rank(ci[1][x].numpy())}' for x,_ in enumerate(ci[0])])
        combinedtable.insert(loc=0, column=s, value=[f'{n_to_rank_float(mean[x][0].numpy())} ± {round(ci[1][x].numpy()-mean[x][0].numpy(),1)}' for x,_ in enumerate(mean)])
    # Reference-rank column, inserted first so it can be used as the index.
    meantable.insert(loc=0, column=against, value=[f'{n_to_rank(x)}' for x in np.arange(lo,hi)])
    stdtable.insert(loc=0, column=against, value=[f'{n_to_rank(x)}' for x in np.arange(lo,hi)])
    combinedtable.insert(loc=0, column=against, value=[f'{n_to_rank(x)}' for x in np.arange(lo,hi)])
    return meantable, stdtable, combinedtable
The following plots mainly serve to visually demonstrate how imprecise the estimates below are.
#%%capture --no-display
# Build the conversion tables against OGS with the configured lengthscale prior.
a='OGS'
mt1,st1,ct1=get_rank_tables(against=a, prior=(gpr_ls_prior,gpr_ls_prior-gpr_ls_prior_delta))
# Fixed display order for the server/organization columns.
cols = ['KGS','OGS','IGS','Foxwq','Tygem','WBaduk','DGS','GoQuest_rating', 'GoQuest_rank','EGF','AGA','Taiwan','Japan','China',]
print(cols)
#print(sorted_cols)
# Each expression below was presumably its own notebook cell so each table
# renders; as a plain script only the values are computed, nothing is shown.
mt1[cols].set_index(a)
st1[cols].set_index(a)
ct1[cols].set_index(a)