Chess Game Data Analysis

By Bobby Becker

Link: https://ogreowl.github.io/chess-analysis/

Problem

The main problem we'll be solving is how skill level correlates with other features of a chess game, and whether we can use this information to predict a player's approximate skill level. Advanced models can already do this, but we'll be using simpler models to make these predictions with fewer features. With simpler models, we can make predictions that are more interpretable and computationally efficient.

In addition, we'll be looking at how different time controls affect other features, and how different types of chess players — Grandmasters, average players, and AI — differ from one another.

In the real world, this type of analysis could be used for chess coaching, AI coaching in online chess games, tournament organization, and, in the future, even AI-agent-based chess bots.

I also believe that this problem holds particular importance due to chess's unique role in the history of AI research. In 1997, IBM's Deep Blue defeated Garry Kasparov in a historic match. Since then, chess has become a standard game for benchmarking different capabilities of AI. While this notebook will focus more on traditional data science analysis, this type of research is important for informing how AI models are developed in this domain.

Summary

This project looks at three chess datasets to analyze patterns in chess play across different levels of expertise, comparing thousands of modern online games from Lichess.org with games from legendary grandmasters (including Bobby Fischer, Garry Kasparov, and Paul Morphy), as well as AI games generated by Stockfish. Using statistical analysis and visualizations, we look at how factors like player ratings influence game length, opening choices, and win rates, while also seeing how chess strategy differs between amateur players, famous grandmasters, and artificial intelligence.

Data

This project incorporates three datasets of chess games:
  • Games collected from the online platform 'Lichess'

  • Chess games played by famous Grandmasters

  • AI chess games played by Stockfish

The first two datasets were both downloaded through Kaggle:

  • https://www.kaggle.com/datasets/datasnaek/chess

  • https://www.kaggle.com/datasets/liury123/chess-game-from-12-top-players

The third dataset was generated through our own code using the Stockfish Python API:

  • Code: ai_games.ipynb

  • API: https://pypi.org/project/stockfish/

Objectives

For these datasets, I am hoping to answer the following questions:

1st. How do player rankings affect the distribution of other features?

  • This is important because it tells us whether the skill level of a chess game affects player choices in ways that can be quantifiably measured

2nd. How do differences in player rankings affect game outcomes?

  • This is interesting because it tells us the degree to which a player's ranking gives them an advantage in terms of likelihood to win.

3rd. How do the different time controls affect game lengths? How do the game lengths correlate to the end condition? (draw, out of time, etc)

  • This will tell us if different time controls have potentially unexpected effects on the outcome of the game.

4th. How do game lengths correlate with player rankings across different opening moves? (3-variable comparison)

  • This will be interesting because it goes one level of granularity deeper than our other analyses: for each opening move, we'll look at how game lengths correlate to player rankings, to see if the correlation differs depending on the first move chosen.

5th. What are the differences between how average players play (dataset 1) compared to Grandmasters & AI?

  • These stats will tell us how high-ranking chess players approach the game compared to an average, casual online player.

Structure

This file is organized into the following sections:

ETL of our Main Dataset

  • Loading & preparing dataset 1 (Lichess games)

  • Selecting main datapoint

  • Printing & visualizing statistics

EDA of our Main Dataset

  • Tracking correlations across variables

  • Visualizing deeper properties

  • Answering questions 1, 2, and 3.

Incorporating Other Datasets

  • Loading, preparing, and analyzing datasets 2 & 3

  • Exploring statistical differences across datasets

  • Answering questions 4 and 5.

Model Exploration

In this section, I'll create a KNN model to predict chess player rankings:

The model works by:

  • Finding games where players used the same first move and had the same outcome

  • Identifying 5 nearest neighbors based on opponent's rankings

  • Computing a weighted average of those neighbors' rankings to predict the target player's rating
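The three bullets above can be sketched in a few lines of pandas. This is only a minimal illustration of the idea, not the notebook's actual implementation; it assumes dataset 1's column names plus a derived `first_move` column (created later in the notebook):

```python
import numpy as np
import pandas as pd

def predict_rating(games, first_move, outcome, opponent_rating, k=5):
    """Sketch of the KNN idea: filter to games with the same first move
    and outcome, take the k games whose opponent rating is closest, and
    return a distance-weighted average of those players' ratings."""
    # Candidate games: same first move and same outcome
    pool = games[(games['first_move'] == first_move) & (games['winner'] == outcome)]
    # Distance = gap between the opponents' ratings
    dist = (pool['black_rating'] - opponent_rating).abs()
    neighbors = dist.nsmallest(k).index
    # Closer neighbors get larger weights (+1 avoids division by zero)
    weights = 1.0 / (dist.loc[neighbors] + 1.0)
    return float(np.average(pool.loc[neighbors, 'white_rating'], weights=weights))
```

The weighting choice (inverse distance) is one of several reasonable options; uniform weights over the k neighbors would also work.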

Additional Resources

Learn more about Stockfish here: https://stockfishchess.org/, or through its GitHub: https://github.com/official-stockfish/Stockfish. You can also read more about the history of AI and chess here: https://builtin.com/artificial-intelligence/chess-ai

To learn more about this type of chess data analysis, I would recommend this paper: https://arxiv.org/abs/2304.11425 . It covers advanced data visualization of chess games in particular positions.

To learn about some state-of-the-art research, I would recommend this paper by Google DeepMind: https://arxiv.org/abs/2402.04494 . It builds a chess AI that plays at an advanced level using transformers, unlike traditional chess engines that rely on search trees.

Even more recent research is trying to deploy AI agents to run and evaluate their own models, such as this Kaggle competition: https://www.kaggle.com/competitions/fide-google-efficiency-chess-ai-challenge .

In [470]:
"""Since I use plotly for this project, I imported a special libary to help me render the visualizations as static images, so that
they could be shown through GitHub Pages"""


import plotly.io as pio
pio.renderers.default = "png"
pio.kaleido.scope.default_scale = 3

Part One: ETL of our main dataset

First, let's load in our dataset, a CSV file saved to our directory. We'll be using the Python library pandas to do this, which you can learn more about here: https://pandas.pydata.org/docs/getting_started/index.html

In [471]:
import pandas as pd

"""The data"""
df = pd.read_csv('games.csv')

print(df.head(5))
         id  rated    created_at  last_move_at  turns victory_status winner  \
0  TZJHLljE  False  1.504210e+12  1.504210e+12     13      outoftime  white   
1  l1NXvwaE   True  1.504130e+12  1.504130e+12     16         resign  black   
2  mIICvQHh   True  1.504130e+12  1.504130e+12     61           mate  white   
3  kWKvrqYL   True  1.504110e+12  1.504110e+12     61           mate  white   
4  9tXo1AUZ   True  1.504030e+12  1.504030e+12     95           mate  white   

  increment_code       white_id  white_rating      black_id  black_rating  \
0           15+2       bourgris          1500          a-00          1191   
1           5+10           a-00          1322     skinnerua          1261   
2           5+10         ischia          1496          a-00          1500   
3           20+0  daniamurashov          1439  adivanov2009          1454   
4           30+3      nik221107          1523  adivanov2009          1469   

                                               moves opening_eco  \
0  d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...         D10   
1  d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...         B00   
2  e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...         C20   
3  d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...         D02   
4  e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...         C41   

                             opening_name  opening_ply  
0        Slav Defense: Exchange Variation            5  
1  Nimzowitsch Defense: Kennedy Variation            4  
2   King's Pawn Game: Leonardis Variation            3  
3  Queen's Pawn Game: Zukertort Variation            3  
4                        Philidor Defense            5  

We can now see the data that we have access to. Here's what we'll be most interested in analyzing:

winner — The winner of the game (black, white, or draw). Since black moves second, chess is considered asymmetrical, so results and opening choices should differ statistically between black and white.

victory_status — The condition which caused the game to end.

white_rating & black_rating — The ELO rankings for both players in the chess match.

moves — The moves played throughout the chess game, given in algebraic notation. We'll use this, rather than 'opening_name', to analyze opening statistics, since we can derive the same information from it in more detail.

rated — A boolean that tells us whether the game affected the players' rankings (rated games will likely be more competitive).

increment_code — This tells us the time allotted to both players. The first number is each player's starting time in minutes, and the second number is the extra time (in seconds) they receive after each move they make.
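As a quick sketch (not part of the notebook's pipeline), codes like '15+2' can be split into their two numeric components with a single string operation:

```python
import pandas as pd

# Parse '<minutes>+<increment>' codes into numeric columns
codes = pd.Series(['15+2', '5+10', '10+0'])
parts = codes.str.split('+', expand=True).astype(int)
parts.columns = ['base_minutes', 'increment_seconds']
print(parts)
```

The same expression applied to `df['increment_code']` would give sortable numeric time-control features instead of raw strings.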

To keep our analysis focused and clean, let's remove all other variables other than these six:

In [472]:
df = df[[
   'rated',           # rated vs unrated analysis
   'victory_status',  # victory status analysis (which is the condition which ends the game)
   'winner',          
   'white_rating',    
   'black_rating',    
   'moves',          
   'increment_code'   # time controls 
]]

To start our data analysis, we'll collect basic statistics on the game outcome (measured by 'winner'), the end condition ('victory_status'), and the ELO ratings of players across the dataset:

In [473]:
#  total wins:
white_wins = df[df['winner'] == 'white'].shape[0]
black_wins = df[df['winner'] == 'black'].shape[0]
draws = df[df['winner'] == 'draw'].shape[0]
total_games = len(df)

# win conditions:
victory_counts = df['victory_status'].value_counts()
total_games = len(df)

# Collect ELO Distribution Statistics:
white_stats = {
   'mean': df['white_rating'].mean(),
   'highest': df['white_rating'].max(),
   'lowest': df['white_rating'].min()
}

black_stats = {
   'mean': df['black_rating'].mean(),
   'highest': df['black_rating'].max(),
   'lowest': df['black_rating'].min()
}


# Print Results:
print("Game Outcome:")
print(f"White wins: {white_wins} games ({(white_wins/total_games)*100:.2f}%)")
print(f"Black wins: {black_wins} games ({(black_wins/total_games)*100:.2f}%)")
print(f"Draws: {draws} games ({(draws/total_games)*100:.2f}%)")
print()

print("End Condition:")
for status in victory_counts.index:
   count = victory_counts[status]
   percentage = (count/total_games)*100
   print(f"{status}: {count} games ({percentage:.2f}%)")
print()

print("White Rating Statistics:")
print(f"Mean rating: {white_stats['mean']:.2f}")
print(f"Highest rating: {white_stats['highest']}")
print(f"Lowest rating: {white_stats['lowest']}")

print("\nBlack Rating Statistics:")
print(f"Mean rating: {black_stats['mean']:.2f}")
print(f"Highest rating: {black_stats['highest']}")
print(f"Lowest rating: {black_stats['lowest']}")
print()
Game Outcome:
White wins: 10001 games (49.86%)
Black wins: 9107 games (45.40%)
Draws: 950 games (4.74%)

End Condition:
resign: 11147 games (55.57%)
mate: 6325 games (31.53%)
outoftime: 1680 games (8.38%)
draw: 906 games (4.52%)

White Rating Statistics:
Mean rating: 1596.63
Highest rating: 2700
Lowest rating: 784

Black Rating Statistics:
Mean rating: 1588.83
Highest rating: 2723
Lowest rating: 789

For 'Game Outcomes', we see that just under half of the games were won by white, around 45% were won by black, and around 5% were draws. This makes sense: white is typically considered to have a slight advantage, and, outside of extremely high-levels of play, draws are fairly uncommon.

The victory status tells us that the majority of games ended by resignation, just under a third by checkmate, and under 10% by a player running out of time or by a draw. The discrepancy between games labeled 'draw' in the winner column (950) and games labeled 'draw' in the victory status column (906) comes from a particular draw condition counting as 'outoftime': when one player runs out of time but the other has insufficient material (not enough pieces to deliver checkmate), the game ends on time yet is scored as a draw.
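One way to check this explanation is to cross-tabulate the two columns and look for draws whose status is 'outoftime'. Here is a sketch on a toy frame shaped like the Lichess data (running `pd.crosstab` on the real `df` would show the 44-game overlap directly):

```python
import pandas as pd

# Toy frame shaped like the Lichess data: a drawn game can end either
# with the 'draw' status or with 'outoftime' (insufficient material)
games = pd.DataFrame({
    'winner': ['white', 'black', 'draw', 'draw'],
    'victory_status': ['mate', 'resign', 'draw', 'outoftime'],
})
print(pd.crosstab(games['winner'], games['victory_status']))
```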

The player ratings range from 784 (a true beginner, though one who at least understands the rules of the game) to 2723, which is grandmaster level. The mean rating is just under 1600, which is fairly advanced for a casual player (the type to beat anyone at his local pub) but still significantly below high-level tournament players. The difference between the black and white rating distributions appears statistically insignificant.

Now let's see the basic statistics for the top first moves, whether or not the games were rated, and the time condition:

In [474]:
# Data for first moves:
first_moves = df['moves'].str.split().str[0].value_counts()
total_games = len(df)

# Data for if the game was rated:
rated_counts = df['rated'].value_counts()
total_games = len(df)

# Data for time controls:
increment_counts = df['increment_code'].value_counts()
total_games = len(df)


# Print our results:
print("First Moves:")
for move in first_moves.index[:8]:
   count = first_moves[move]
   percentage = (count/total_games)*100
   print(f"{move}: {count} games ({percentage:.2f}%)")
print()

print("Game Type:")
for game_type, count in rated_counts.items():
   percentage = (count/total_games)*100
   print(f"{'Rated' if game_type else 'Unrated'}: {count} games ({percentage:.2f}%)")
print()

print("Top 5 Time Controls:")
for increment in increment_counts.index[:8]: 
   count = increment_counts[increment]
   percentage = (count/total_games)*100
   print(f"{increment}: {count} games ({percentage:.2f}%)")
First Moves:
e4: 12598 games (62.81%)
d4: 4522 games (22.54%)
Nf3: 725 games (3.61%)
c4: 716 games (3.57%)
e3: 416 games (2.07%)
g3: 186 games (0.93%)
b3: 173 games (0.86%)
f4: 166 games (0.83%)

Game Type:
Rated: 16155 games (80.54%)
Unrated: 3903 games (19.46%)

Top 8 Time Controls:
10+0: 7721 games (38.49%)
15+0: 1311 games (6.54%)
15+15: 850 games (4.24%)
5+5: 738 games (3.68%)
5+8: 697 games (3.47%)
8+0: 588 games (2.93%)
10+5: 579 games (2.89%)
15+10: 461 games (2.30%)

In terms of first moves, the two that are overwhelmingly the most common are e4, moving the king's pawn up two squares, and d4, moving the queen's pawn up two squares. After that come Nf3, where white develops the king's-side knight toward the center; c4, where white moves the bishop's pawn up two squares (the "English Opening"); and e3, where white moves the king's pawn up one square. Beyond these five openings, every other possible first move is played less than 1% of the time in this dataset.

Around 80% of the games are rated, making up the majority of the dataset. This could be an interesting way to split the dataset later on to explore the differences between rated and unrated games.

And finally, the most common time controls are 10+0 (10 minutes per player, no increment), 15+0 (15 minutes), and 15+15 (15 minutes per player to start, plus an additional 15 seconds after each move they make). It'd be interesting to see how time controls affect win rates, openings used, or the win condition.

To communicate these key data points visually, we'll use 'plotly' to create pie charts and bar graphs:

In [475]:
"""In this code block, I prepare all of our data for visualization"""

# edit dataframe to prepare for visualization
white_wins = df[df['winner'] == 'white'].shape[0]
black_wins = df[df['winner'] == 'black'].shape[0]
draws = df[df['winner'] == 'draw'].shape[0]  # 'draw' is stored as a winner value, not NaN
total_games = len(df)

outcomes_data = {
   'Outcome': ['White Wins', 'Black Wins', 'Draws'],
   'Games': [white_wins, black_wins, draws],
   'Percentage': [
       f"{(white_wins/total_games)*100:.1f}%",
       f"{(black_wins/total_games)*100:.1f}%",
       f"{(draws/total_games)*100:.1f}%"
   ]
}

# Calculate victory status (condition which the game ends with)
victory_counts = df['victory_status'].value_counts()

# prepare data for victory status pie chart
victory_data = {
   'Status': list(victory_counts.index),
   'Games': list(victory_counts.values),
   'Percentage': [
       f"{(count/total_games)*100:.1f}%" for count in victory_counts.values
   ]
}

# rated vs unrated
rated_counts = df['rated'].value_counts()

# data for game type pie chart
game_type_data = {
   'Type': ['Rated', 'Unrated'],
   'Games': [rated_counts[True], rated_counts[False]],
   'Percentage': [
       f"{(rated_counts[True]/total_games)*100:.1f}%",
       f"{(rated_counts[False]/total_games)*100:.1f}%"
   ]
}

# ratings data
all_ratings = pd.concat([df['white_rating'], df['black_rating']])

# rating stats
stats = {
   'min': all_ratings.min(),
   'max': all_ratings.max(),
   'mean': all_ratings.mean(),
   'Q1': all_ratings.quantile(0.25),
   'Q3': all_ratings.quantile(0.75)
}

#colors for rating stats lines
colors = {
   'min': 'red',
   'max': 'green',
   'mean': 'purple',
   'Q1': 'orange',
   'Q3': 'orange'
}

#  first moves frequency
first_moves = df['moves'].str.split().str[0].value_counts().head(8)
first_moves_percentage = (first_moves/len(df)*100).round(2)

# time controls frequency
time_controls = df['increment_code'].value_counts().head(8)
time_controls_percentage = (time_controls/len(df)*100).round(2)

Now that our data is prepared, let's make pie charts for the percentage of winners by color, the condition that ended the game, and the number of games that were rated versus unrated:

In [476]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Create figure with subplots
fig = make_subplots(
   rows=1, cols=3,
   specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]],
   subplot_titles=('Chess Game Outcomes', 'Victory Status Distribution', 'Distribution of Rated vs Unrated Games'),
   horizontal_spacing=0.05  # Reduce space between plots
)

# First pie chart (Game Outcomes)
fig.add_trace(
   go.Pie(
       labels=outcomes_data['Outcome'],
       values=outcomes_data['Games'],
       textposition='inside',
       textinfo='label+percent',
       hovertemplate="<b>%{label}</b><br>" +
                    "Games: %{value:,}<br>" +
                    "Percentage: %{percent:.1f}%<br>" +
                    "<extra></extra>",
       marker=dict(
           colors=['#184E77', '#34A0A4', '#76C893']
       )
   ),
   row=1, col=1
)

# Second pie chart (Victory Status)
fig.add_trace(
   go.Pie(
       labels=victory_data['Status'],
       values=victory_data['Games'],
       textposition='inside',
       textinfo='label+percent',
       hovertemplate="<b>%{label}</b><br>" +
                    "Games: %{value:,}<br>" +
                    "Percentage: %{percent:.1f}%<br>" +
                    "<extra></extra>",
       marker=dict(
           colors=['#184E77', '#34A0A4', '#76C893', '#D9ED92']
       )
   ),
   row=1, col=2
)

# Third pie chart (Rated vs unrated)
fig.add_trace(
   go.Pie(
       labels=game_type_data['Type'],
       values=game_type_data['Games'],
       textposition='inside',
       textinfo='label+percent',
       hovertemplate="<b>%{label}</b><br>" +
                    "Games: %{value:,}<br>" +
                    "Percentage: %{percent:.1f}%<br>" +
                    "<extra></extra>",
       marker=dict(
           colors=['#184E77', '#76C893']
       )
   ),
   row=1, col=3
)

fig.update_layout(
   height=400,  # Reduced from 500
   width=1000,  # Reduced from 1500
   showlegend=False,
   title_font_size=16,  # Reduced from 20
   margin=dict(t=80, b=20, l=20, r=20)  # Reduce margins
)

# traces
for i in range(len(fig.data)):
   fig.data[i].marker.line = dict(color='#FFFFFF', width=1)

fig.show()
[Figure: three pie charts of game outcomes, victory status distribution, and rated vs. unrated games]

And let's make bar graphs for the other variables: the distribution of ratings, of first moves, and of time conditions:

In [477]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np

# make a figure with subplots
fig = make_subplots(
   rows=2, cols=2,
   specs=[[{'colspan': 2}, None],
          [{}, {}]],
   vertical_spacing=0.15,
   subplot_titles=('Rating Distribution', 'First Moves by Popularity', 'Time Control Distribution')
)

# 1st plot (ratings Distribution)
fig.add_trace(
   go.Histogram(
       x=all_ratings,
       nbinsx=50,
       name='Rating Distribution',
       hovertemplate='Rating: %{x}<br>Count: %{y}<extra></extra>',
       opacity=0.7,
       marker_color='#184E77'
   ),
   row=1, col=1
)

#  vertical lines for stats
for stat_name, value in stats.items():
   fig.add_vline(
       x=value,
       line_dash="dash",
       line_color=colors[stat_name],
       row=1, col=1,
       annotation_text=f"{stat_name}: {value:.0f}",
       annotation_position="top"
   )

# 2nd plot (First Moves)
fig.add_trace(
   go.Bar(
       x=first_moves.index,
       y=first_moves_percentage,
       marker_color='#34A0A4'
   ),
   row=2, col=1
)

# 3rd plot (Time Controls)
fig.add_trace(
   go.Bar(
       x=time_controls.index,
       y=time_controls_percentage,
       marker_color='#76C893'
   ),
   row=2, col=2
)

#  layout
fig.update_layout(
   height=1000,
   width=1200,
   showlegend=False,
   title_x=0.5,
   title_font_size=16,
)

fig.layout.annotations[0].y += 0.05

fig.update_xaxes(title_text="Rating", row=1, col=1)
fig.update_yaxes(title_text="Count", row=1, col=1)
fig.update_yaxes(title_text="Percentage of Games", row=2, col=1)
fig.update_yaxes(title_text="Percentage of Games", row=2, col=2)
fig.update_xaxes(title_text="First Move", row=2, col=1)
fig.update_xaxes(title_text="Time Condition", row=2, col=2)

fig.show()
[Figure: rating distribution histogram with summary-statistic lines, plus bar charts of first moves and time controls]

Part Two: Tracking Correlations Between Variables

Let's first answer one of our main objectives: how does player ranking affect the game outcome?

To do this, we'll create 5 line charts, each one tracking how one of our other main features changes at every level of player rating. Ratings (we'll use the white player's rating) will be split into 10 buckets with roughly an equal number of games in each one.

In [478]:
"""In this cell, we prepare our data to be visualized"""

from plotly.subplots import make_subplots

# first moves calculations
first_moves_of_interest = ['e4', 'd4', 'c4', 'Nf3']
df['first_move'] = df['moves'].str.split().str[0]

# time controls
top_time_controls = df['increment_code'].value_counts().head(5).index.tolist()

# splitting games into bins
bins = 10
df['rating_range'] = pd.qcut(df['white_rating'], q=bins)
results = pd.DataFrame()

for interval in df['rating_range'].unique().categories:
    range_games = df[df['rating_range'] == interval]
    total_games = len(range_games)

    # win %
    white_wins = round((range_games['winner'] == 'white').sum() / total_games * 100, 2)
    black_wins = round((range_games['winner'] == 'black').sum() / total_games * 100, 2)
    draws = round((range_games['winner'] == 'draw').sum() / total_games * 100, 2)

    # rated/unrated game %
    rated_percentage = round((range_games['rated'] == True).sum() / total_games * 100, 2)
    unrated_percentage = round((range_games['rated'] == False).sum() / total_games * 100, 2)

    #  1st move % for top moves
    move_percentages = {
        move: round((range_games['first_move'] == move).sum() / total_games * 100, 2)
        for move in first_moves_of_interest
    }

    # time control % 
    time_control_percentages = {
        tc: round((range_games['increment_code'] == tc).sum() / total_games * 100, 2)
        for tc in top_time_controls
    }

    # win condition %
    victory_percentages = {
        status: round((range_games['victory_status'] == status).sum() / total_games * 100, 2)
        for status in df['victory_status'].unique()
    }

    # data to dict to make visualizations easier
    data_dict = {
        'rating_range': [f"{int(interval.left)}-{int(interval.right)}"],
        'lower_bound': [interval.left],
        'White Wins': [white_wins],
        'Black Wins': [black_wins],
        'Draws': [draws],
        'Rated Games': [rated_percentage],
        'Unrated Games': [unrated_percentage],
        'Number of Games': [total_games]
    }
    data_dict.update(move_percentages)
    data_dict.update(time_control_percentages)
    data_dict.update(victory_percentages)

    # Append the data to  df
    results = pd.concat([results, pd.DataFrame(data_dict)])

# Results by rating range
results = results.sort_values('lower_bound')
In [479]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(
    rows=1, cols=5,
    subplot_titles=('Game Outcomes', 
                    'First Move Played', 
                    'Rated vs Unrated Games',
                    'Time Controls',
                    'Ending Condition'),
    horizontal_spacing=0.05
)

# Colors for all categories
outcome_colors = {'White Wins': '#184E77', 'Black Wins': '#76C893', 'Draws': '#e74c3c'}
move_colors = {'e4': '#f39c12', 'd4': '#9b59b6', 'c4': '#3498db', 'Nf3': '#1abc9c'}
rated_colors = {'Rated Games': '#2ecc71', 'Unrated Games': '#e67e22'}
time_control_colors = dict(zip(top_time_controls, ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEEAD']))
victory_colors = dict(zip(df['victory_status'].unique(), ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']))

# game outcomes (first plot)
for outcome, color in outcome_colors.items():
    fig.add_trace(go.Scatter(
        x=results['rating_range'], 
        y=results[outcome],
        name=outcome, 
        line=dict(color=color, width=2),
        mode='lines+markers', 
        marker=dict(size=8),
        legendgroup='outcomes'
    ), row=1, col=1)

# first moves (second plot)
for move, color in move_colors.items():
    fig.add_trace(go.Scatter(
        x=results['rating_range'], 
        y=results[move],
        name=f'{move} Opening', 
        line=dict(color=color, width=2),
        mode='lines+markers', 
        marker=dict(size=8),
        legendgroup='moves'
    ), row=1, col=2)

# rated status (third plot)
for status, color in rated_colors.items():
    fig.add_trace(go.Scatter(
        x=results['rating_range'], 
        y=results[status],
        name=status, 
        line=dict(color=color, width=2),
        mode='lines+markers', 
        marker=dict(size=8),
        legendgroup='rated'
    ), row=1, col=3)

# time controls (fourth plot)
for tc, color in time_control_colors.items():
    fig.add_trace(go.Scatter(
        x=results['rating_range'], 
        y=results[tc],
        name=f'{tc}', 
        line=dict(color=color, width=2),
        mode='lines+markers', 
        marker=dict(size=8),
        legendgroup='timecontrols'
    ), row=1, col=4)

# win condition (fifth plot)
for status, color in victory_colors.items():
    fig.add_trace(go.Scatter(
        x=results['rating_range'], 
        y=results[status],
        name=f'End: {status}', 
        line=dict(color=color, width=2),
        mode='lines+markers', 
        marker=dict(size=8),
        legendgroup='victory'
    ), row=1, col=5)

#  layout
fig.update_layout(
    height=600,  
    width=2400,
    title_text="Chess Game Analysis by Player Rating",
    title_x=0.5,
    legend=dict(
        orientation="h",  # Horizontal legend
        yanchor="bottom",
        y=-0.8, 
        xanchor="center",
        x=0.5,
        font=dict(size=12),
    ),
    margin=dict(b=200),  # bottom margin for legends
    hovermode='x unified'
)

fig.update_yaxes(title_text="Percentage of Games", ticksuffix="%", range=[0, 100])
fig.update_xaxes(title_text="Player Rating Range", tickangle=45)

fig.show()

# Print detailed statistics
print("\nStatistics by Rating Range:")
print(results.round(2))
[Figure: five line charts of game outcomes, first moves, rated vs. unrated games, time controls, and ending conditions across rating buckets]
Statistics by Rating Range:
  rating_range  lower_bound  White Wins  Black Wins  Draws  Rated Games  \
0     783-1236        784.0       35.44       60.43   4.13        78.15   
0    1236-1351       1236.0       44.18       52.70   3.12        83.70   
0    1351-1441       1351.0       47.85       47.35   4.80        81.10   
0    1441-1500       1441.0       48.84       47.11   4.05        65.64   
0    1500-1567       1500.0       49.57       45.90   4.52        82.93   
0    1567-1651       1567.0       51.54       44.37   4.09        87.57   
0    1651-1737       1651.2       54.67       40.94   4.40        83.26   
0    1737-1844       1737.0       51.65       43.39   4.96        82.97   
0    1844-1979       1844.0       54.38       39.36   6.26        84.03   
0    1979-2700       1979.0       60.60       32.32   7.08        77.36   

   Unrated Games  Number of Games     e4     d4  ...   Nf3   10+0  15+0  \
0          21.85             2009  60.58  18.17  ...  2.64  32.40  6.22   
0          16.30             2019  65.08  18.67  ...  2.43  37.84  7.03   
0          18.90             2000  68.05  15.75  ...  4.05  34.25  6.10   
0          34.36             2148  64.94  17.78  ...  2.75  34.03  6.84   
0          17.07             1880  68.19  21.17  ...  1.70  39.31  5.74   
0          12.43             1979  66.60  21.02  ...  2.58  41.08  6.72   
0          16.74             2025  62.86  25.78  ...  3.70  37.04  7.65   
0          17.03             1996  58.87  29.06  ...  3.31  45.84  7.77   
0          15.97             1997  55.73  30.20  ...  6.86  47.97  5.46   
0          22.64             2005  57.36  28.13  ...  6.08  35.66  5.74   

   15+15   5+5   5+8  outoftime  resign   mate  draw  
0   9.86  2.14  3.93       8.06   45.84  42.16  3.93  
0   4.46  2.08  4.36       8.57   47.55  40.96  2.92  
0   4.55  2.85  4.75       7.40   51.70  36.25  4.65  
0   4.42  3.77  4.93       7.82   50.98  37.43  3.77  
0   3.88  4.36  3.62       7.61   57.02  31.06  4.31  
0   4.35  4.09  3.23       8.99   55.33  31.78  3.89  
0   3.41  4.54  2.57       8.44   59.85  27.41  4.30  
0   3.11  3.51  3.21       8.42   58.97  28.01  4.61  
0   2.35  2.95  2.70       9.31   63.14  21.73  5.81  
0   1.95  6.53  1.35       9.13   65.84  18.00  7.03  

[10 rows x 21 columns]

Awesome! This provides us with very clear insights:

  1. In terms of game outcomes, draws steadily increase as ratings get higher. This is most often attributed to the fact that, at higher levels, competitive players are less likely to take risks, which leads to more drawish scenarios at the end of the game. White's win rate also increases with rating: white has a slight but significant first-move advantage, and stronger players are better able to exploit it.

  2. First-move percentages stay relatively constant for the most part, though d4 and Nf3 increase marginally (d4 from ~18% to ~28%, Nf3 from ~3% to ~6%).

  3. As ratings increase, more and more games end with resignation (from ~46% to ~66%) rather than checkmate. This suggests that higher-level players more often recognize that they are in a losing position, and resign before being checkmated.

Let's create a similar chart to see how the rating differences impact win percentages. Here, we'll create a new column for the rating difference, and then split the data into 10 bins based on the difference. To do this, we'll use the pd.qcut() function, which is similar to pd.cut() but uses quantiles to determine the bins.
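For intuition, here is a small illustration (not from the notebook) of the difference: pd.cut makes equal-width bins, so a few extreme rating differences would dominate the bin edges, while pd.qcut makes equal-frequency bins:

```python
import pandas as pd

diffs = pd.Series([-500, -50, -20, -5, 0, 5, 20, 50, 500, 900])

# pd.cut: equal-width intervals -> outliers leave most games in one bin
print(pd.cut(diffs, bins=2).value_counts().sort_index().tolist())

# pd.qcut: quantile intervals -> each bin holds ~the same number of games
print(pd.qcut(diffs, q=2).value_counts().sort_index().tolist())
```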

In [480]:
# rating difference == white - black
df['rating_diff'] = df['white_rating'] - df['black_rating']

# bins for rating differences
bins = 10
df['diff_range'] = pd.qcut(df['rating_diff'], q=bins)

# win percentages for each rating difference range
results = pd.DataFrame()
for interval in df['diff_range'].unique().categories:
   range_games = df[df['diff_range'] == interval]
   total_games = len(range_games)
   
   white_wins = (range_games['winner'] == 'white').sum() / total_games * 100
   black_wins = (range_games['winner'] == 'black').sum() / total_games * 100
   draws = (range_games['winner'] == 'draw').sum() / total_games * 100
   
   results = pd.concat([results, pd.DataFrame({
       'diff_range': [f"{int(interval.left)}-{int(interval.right)}"],
       'lower_bound': [interval.left],
       'White Wins': [white_wins],
       'Black Wins': [black_wins],
       'Draws': [draws],
       'Number of Games': [total_games],
       'Avg Difference': [range_games['rating_diff'].mean()]
   })])

results = results.sort_values('lower_bound')


fig = go.Figure()
fig.add_trace(go.Scatter(
   x=results['Avg Difference'],
   y=results['White Wins'],
   name='White Wins',
   mode='lines+markers',
   line=dict(color='#A9A9A9', width=2),
   marker=dict(size=8)
))

fig.add_trace(go.Scatter(
   x=results['Avg Difference'],
   y=results['Black Wins'],
   name='Black Wins',
   mode='lines+markers',
   line=dict(color='black', width=2),
   marker=dict(size=8)
))

fig.add_trace(go.Scatter(
   x=results['Avg Difference'],
   y=results['Draws'],
   name='Draws',
   mode='lines+markers',
   line=dict(color='#696969', width=2),
   marker=dict(size=8)
))


fig.update_layout(
   title='Game Outcomes by Rating Difference (White Rating - Black Rating)',
   xaxis_title='Rating Difference',
   yaxis_title='Percentage of Games',
   width=1000,
   height=600,
   title_x=0.5,
   legend=dict(
       yanchor="top",
       y=0.99,
       xanchor="right",
       x=0.99
   ),
   yaxis=dict(
       tickformat='.1f',
       ticksuffix='%'
   ),
   hovermode='x unified'
)

fig.show()

A very clear trend here: as the rating difference increases, the higher-rated side is more likely to win. This suggests that Lichess's rating system does a fairly accurate job of measuring player skill: for each 100-point increase in rating difference, a player is about 5-10% more likely to win the game, though the effect does appear to plateau moderately at more extreme differences. Draws also appear to slightly, though consistently, increase as the difference between the two players gets smaller.
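As a sanity check on that 5-10%-per-100-points figure, the classical Elo expected-score formula predicts roughly the same curve, including the plateau at extreme differences. (Lichess actually uses the Glicko-2 rating system, so this is only a rough analogue, not its exact model:)

```python
def elo_expected_score(rating_diff):
    """Expected score for the higher-rated side (win = 1, draw = 0.5)
    under the classical Elo model."""
    return 1 / (1 + 10 ** (-rating_diff / 400))

for diff in (0, 100, 200, 400, 800):
    print(f"{diff:>4}: {elo_expected_score(diff):.3f}")
```

The curve climbs steeply near zero and flattens out past ~400 points, mirroring the plateau in the chart above.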

Finally, let's use a violin plot & a box plot to visualize two more distributions related to the number of moves: 1. how game lengths vary across victory types, and 2. how game lengths vary across time controls.

To do this, we'll first create a new column for the number of moves, which we can do by counting the number of spaces in the moves column + 1. Then, we'll create a violin plot to compare game lengths across different victory types, and a box plot to compare game lengths across different time controls.
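One caveat worth keeping in mind: since the moves column lists each side's moves separately, counting spaces + 1 actually gives the number of half-moves (plies), not full move pairs. A tiny sketch:

```python
import pandas as pd

# "e4 e5 Nf3 Nc6" is 2 full moves (one per side each turn), but 4 half-moves (plies)
moves = pd.Series(["e4 e5 Nf3 Nc6", "d4 d5"])
print(moves.str.count(' ') + 1)
```

This is fine for our purposes, since we only compare game lengths against each other, but strictly speaking the move counts below are in plies.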

In [481]:
# Calculate number of moves from the 'moves' column by counting spaces + 1
df['num_moves'] = df['moves'].str.count(' ') + 1

# Create violin plot comparing game lengths across different victory types
fig = go.Figure()

for victory_type in df['victory_status'].unique():
    moves = df[df['victory_status'] == victory_type]['num_moves']
    
    fig.add_trace(go.Violin(
        x=[victory_type] * len(moves),
        y=moves,
        name=victory_type,
        box_visible=True,
        meanline_visible=True
    ))

fig.update_layout(
    title='Distribution of Game Lengths by Victory Type',
    xaxis_title='Victory Type',
    yaxis_title='Number of Moves',
    width=1000,
    height=600,
    title_x=0.5
)

fig.show()

fig = go.Figure()

for time_control in df['increment_code'].value_counts().head(6).index:
    moves = df[df['increment_code'] == time_control]['num_moves']
    
    fig.add_trace(go.Box(
        y=moves,
        name=time_control,
        boxpoints='outliers'
    ))

fig.update_layout(
    title='Distribution of Game Lengths by Time Control Format',
    xaxis_title='Time Control',
    yaxis_title='Number of Moves',
    width=1000,
    height=600,
    title_x=0.5
)

fig.show()

Graph 1 gives some interesting insights: outoftime games have the most extreme long games, though draws are, on average, the longest. One possible explanation is that games where a player runs out of time often involve both players making a rapid flurry of moves without thinking, missing mate opportunities and shuffling their kings around to avoid losing on time. This would account for the most extreme cases; but in most games where the clock is a major factor, the total number of moves is shorter than in drawn games, which often reach the endgame and are consistently longer. Resignations and checkmates are, as expected, shorter on average than draws and outoftime games; more of these endings never reach the endgame, which makes them much shorter on average.

Graph 2 gives us some more insight, and perhaps explains why outoftime games have the highest maximums: as we see, the 10+0 time control has the largest number of total moves. This may surprise some people: wouldn't games with stricter time controls end sooner? Actually, when both players have enough time to think, they are less likely to play in rapid, wild, and unpredictable ways that don't bring the game closer to its conclusion. The extreme move counts in shorter time controls are most likely explained by this, though there are other possibilities to consider.

Part Three: Incorporating Additional Datasets

Here, we'll start incorporating our other two datasets: one of grandmaster chess games, and one of chess games played by AI. Let's first load in our files:

In [482]:
import pandas as pd

# grandmaster dataset
df2 = pd.read_csv('games2.csv')

# ai games dataset
df3 = pd.read_csv('games3.csv')

print(df2.head(5))
print(df3.head(5))
  player  color                     opponent  player_Elo  opponent_Elo result  \
0    Tal  Black           Feibert Fred (GER)         NaN        2285.0    Win   
1    Tal  Black  Westerinen Heikki M J (FIN)         NaN        2410.0    Win   
2    Tal  Black            Larsen Bent (DEN)         NaN        2565.0    Win   
3    Tal  White  Azmaiparashvili Zurab (GEO)         NaN        2470.0   Draw   
4    Tal  White            Wedberg Tom (SWE)         NaN        2480.0    Win   

                 event                            site        date  \
0            It (open)              Leningrad (Russia)  1991.??.??   
1                   It                 Moscow (Russia)  1982.??.??   
2          It (cat.14)  Bugojno (Bosnia & Herzegovina)  1984.??.??   
3  Memorial V.Goglidze               Tbilisi (Georgia)  1986.??.??   
4            It (open)                  New York (USA)  1990.??.??   

                                               lines  moves  \
0  1. d4 Nf6 2. Nf3 c5 3. c3 e6 4. Bf4 b6 5. e3 B...   63.0   
1  1. e4 c5 2. Nf3 d6 3. d4 cxd4 4. Nxd4 Nf6 5. N...   74.0   
2  1. c4 c5 2. g3 g6 3. Bg2 Bg7 4. e3 Nf6 5. Nc3 ...   56.0   
3  1. d4 d6 2. Nf3 g6 3. g3 Bg7 4. Bg2 Nd7 5. O-O...   31.0   
4  1. Nf3 Nf6 2. c4 e6 3. g3 d5 4. Bg2 Be7 5. O-O...   69.0   

                                           file_name  
0         Feibert-Fred_vs_Tal-Mikhail_1991.__.__.pgn  
1  Westerinen-Heikki-M-J_vs_Tal-Mikhail_1982.__._...  
2          Larsen-Bent_vs_Tal-Mikhail_1984.__.__.pgn  
3  Tal-Mikhail_vs_Azmaiparashvili-Zurab_1986.__._...  
4          Tal-Mikhail_vs_Wedberg-Tom_1990.__.__.pgn  
   game_number winner                                              moves
0            1   Draw  e4 e5 Nf3 Nc6 Bc4 Nf6 Ng5 d5 exd5 Na5 Bb5+ c6 ...
1            2   Draw  e4 c5 Nf3 Nc6 Bb5 g6 Bxc6 bxc6 O-O Bg7 Re1 d6 ...
2            3   Draw  e4 e5 Nf3 Nf6 d4 Nxe4 Nxe5 d5 Bd3 Nd7 Nxd7 Bxd...
3            6   Draw  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 b5 Bb...
4            4   Draw  e4 c5 Nc3 Nc6 Nge2 Nd4 d3 g6 h4 Nf6 e5 Nh5 Ne4...

Then, let's filter dataset 2 down so it contains the same columns as dataset 1, with the data represented in the same format. Since some player_Elo and opponent_Elo values are missing, we'll clear those rows out; we'll also combine the game result with the player's color to determine which color won (so it matches the formatting of the first dataset). Finally, we'll have to strip the move numbers out of the moves so they match the first dataset, which does not prefix each move with a number.

In [483]:
import pandas as pd

# grandmaster dataset
df2 = pd.read_csv('games2.csv')

def transform_games2_dataset(df2):
    # remove rows with missing or zero ratings
    df_transformed = df2[
        (df2['player_Elo'].notna()) & 
        (df2['opponent_Elo'].notna()) & 
        (df2['player_Elo'] != 0) & 
        (df2['opponent_Elo'] != 0)
    ].copy()
    
    # determine winner (results are recorded from the named player's perspective)
    def determine_winner(row):
        if row['result'] == 'Draw':
            return 'draw'
        elif row['result'] == 'Win':
            return row['color'].lower()
        elif row['result'] == 'Loss':
            # the player lost, so the opposite color won
            return 'black' if row['color'] == 'White' else 'white'
        return None
    
    # rating to assign
    def assign_ratings(row):
        if row['color'] == 'White':
            return pd.Series({
                'white_rating': row['player_Elo'],
                'black_rating': row['opponent_Elo']
            })
        else:
            return pd.Series({
                'white_rating': row['opponent_Elo'],
                'black_rating': row['player_Elo']
            })
    
    # clean up our chess moves
    def clean_chess_moves(moves_str):
        if pd.isna(moves_str):
            return ""
        parts = moves_str.split()
        cleaned_moves = [part for part in parts if '.' not in part]
        return ' '.join(cleaned_moves)
    
    # Apply changes that we made
    df_transformed['winner'] = df_transformed.apply(determine_winner, axis=1)
    ratings = df_transformed.apply(assign_ratings, axis=1)
    df_transformed['white_rating'] = ratings['white_rating'].astype(int)
    df_transformed['black_rating'] = ratings['black_rating'].astype(int)
    df_transformed['moves'] = df_transformed['lines'].fillna("").apply(clean_chess_moves)
    
    return df_transformed[['winner', 'white_rating', 'black_rating', 'moves']]

df2 = transform_games2_dataset(df2)

# needs to be lowercase to match the other datasets
df3['winner'] = df3['winner'].str.lower()

print("Lichess Games")
print(df.head(2))
print()
print("Grandmaster Games")
print(df2.head(2))
print("AI Games")
print(df3.head(50))
Lichess Games
   rated victory_status winner  white_rating  black_rating  \
0  False      outoftime  white          1500          1191   
1   True         resign  black          1322          1261   

                                               moves increment_code  \
0  d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...           15+2   
1  d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...           5+10   

  first_move      rating_range  rating_diff       diff_range  num_moves  
0         d4  (1441.0, 1500.0]          309  (293.0, 1499.0]         13  
1         d4  (1236.0, 1351.0]           61     (39.0, 89.0]         16  

Grandmaster Games
   winner  white_rating  black_rating  \
28   draw          2530          2620   
54   draw          2535          2600   

                                                moves  
28  e4 c5 Nf3 d6 d4 cxd4 Qxd4 Nc6 Bb5 Bd7 Bxc6 Bxc...  
54  d4 Nf6 c4 e6 g3 c5 d5 exd5 cxd5 d6 Nc3 g6 Bg2 ...  
AI Games
    game_number winner                                              moves
0             1   draw  e4 e5 Nf3 Nc6 Bc4 Nf6 Ng5 d5 exd5 Na5 Bb5+ c6 ...
1             2   draw  e4 c5 Nf3 Nc6 Bb5 g6 Bxc6 bxc6 O-O Bg7 Re1 d6 ...
2             3   draw  e4 e5 Nf3 Nf6 d4 Nxe4 Nxe5 d5 Bd3 Nd7 Nxd7 Bxd...
3             6   draw  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 b5 Bb...
4             4   draw  e4 c5 Nc3 Nc6 Nge2 Nd4 d3 g6 h4 Nf6 e5 Nh5 Ne4...
5             5   draw  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 b5 Bb...
6             7   draw  d4 d5 c4 e6 Nc3 Nf6 Bg5 Be7 e3 O-O Nf3 Nbd7 Rc...
7             9   draw  e4 c5 Nf3 Nc6 Bb5 e6 Bxc6 bxc6 d3 Qc7 O-O d6 N...
8             8   draw  e4 e5 Nf3 Nc6 d4 exd4 Nxd4 Nf6 Nxc6 bxc6 Bd3 d...
9            11   draw  e4 e5 Nf3 Nc6 Bc4 Bc5 c3 Nf6 d3 d6 b4 Bb6 a4 a...
10           10   draw  e4 e5 Nf3 Nf6 Nxe5 d6 Nf3 Nxe4 d4 d5 Bd3 Be7 O...
11           12   draw  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 b5 Bb...
12           15   draw  e4 e5 Nf3 Nc6 Bc4 Bc5 d3 Nf6 c3 d6 O-O a6 Re1 ...
13           13  white  e4 c5 Nf3 Nc6 Bb5 g6 O-O Bg7 c3 Nf6 Re1 O-O d4...
14           14  white  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 b5 Bb...
15           16   draw  e4 e5 Nf3 Nc6 d4 exd4 Nxd4 Nf6 Nxc6 bxc6 e5 Qe...
16           18   draw  e4 e5 Nf3 Nf6 Nxe5 d6 Nf3 Nxe4 d4 d5 Bd3 Be7 O...
17           17   draw  e4 e5 Nf3 Nf6 Nxe5 d6 Nf3 Nxe4 d4 d5 Bd3 Bd6 O...
18           19   draw  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 b5 Bb...
19           20  white  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 b5 Bb...
20           22   draw  e4 e5 Nf3 Nc6 Bb5 Nf6 O-O Nxe4 Re1 Nd6 Nxe5 Be...
21           21  white  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 b5 Bb...
22           24   draw  e4 e5 Nf3 Nf6 Nxe5 d6 Nf3 Nxe4 d4 d5 Bd3 Bf5 O...
23           23   draw  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Be7 O-O Nf6 Re1 b5 Bb...
24           26   draw  e4 e5 Nf3 Nc6 d4 exd4 Nxd4 Nf6 Nxc6 bxc6 e5 Qe...
25           27   draw  e4 e5 Nf3 Nf6 Nxe5 d6 Nf3 Nxe4 d4 Be7 Bd3 d5 O...
26           25  white  e4 e5 Nf3 Nc6 Bb5 Nf6 O-O Nxe4 Re1 Nd6 Nxe5 Nx...
27           29   draw  e4 e5 Nf3 Nc6 d4 exd4 Nxd4 Nf6 Nxc6 bxc6 e5 Qe...
28           30   draw  e4 e5 Nf3 Nf6 Nxe5 d6 Nf3 Nxe4 d4 d5 Bd3 Be7 O...
29           28   draw  e4 c5 Nf3 Nc6 Bb5 g6 O-O Bg7 c3 Nf6 Re1 O-O d4...
30           31   draw  e4 c5 Nf3 e6 d4 cxd4 Nxd4 a6 c4 Bb4+ Bd2 Qb6 N...
31           32   draw  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O b5 Bb3 Bc5 a4...
32           33   draw  e4 e5 Nf3 Nf6 Nxe5 d6 Nf3 Nxe4 d4 d5 Bd3 Bd6 O...
33           34  white  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Be7 Re1 b5 Bb...
34           35   draw  e4 e5 Nf3 Nc6 d4 exd4 Nxd4 Bb4+ c3 Bc5 Nxc6 bx...
35           36   draw  e4 e5 Nf3 Nc6 d4 exd4 Nxd4 Nf6 Nxc6 bxc6 Bd3 d...
36           38   draw  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Be7 Re1 b5 Bb...
37           39   draw  e4 e5 Nf3 Nc6 Bc4 Bc5 c3 Nf6 d3 d6 b4 Bb6 a4 a...
38           40   draw  e4 c5 Nc3 d6 Nge2 e5 Ng3 Be6 Bb5+ Nc6 d3 Nf6 O...
39           41   draw  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O b5 Bb3 Bb7 c3...
40           42   draw  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 b5 Bb...
41           43   draw  e4 c5 Nf3 Nc6 Bb5 g6 O-O Bg7 c3 Nf6 Re1 O-O d4...
42           44   draw  e4 e5 Nf3 Nf6 Nxe5 d6 Nf3 Nxe4 d4 Be7 Bd3 d5 O...
43           45   draw  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 b5 Bb...
44           46   draw  e4 e5 Nf3 Nf6 Nxe5 d6 Nf3 Nxe4 d4 d5 Bd3 Be7 O...
45           37  black  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 b5 Bb...
46           48   draw  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 b5 Bb...
47           50   draw  e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 b5 Bb...
48           49   draw  e4 c5 Nf3 Nc6 Bb5 g6 Bxc6 bxc6 O-O Bg7 Re1 Qc7...
49           51  black  e4 c5 Nf3 Nc6 d4 cxd4 Nxd4 Nf6 Nc3 e5 Ndb5 d6 ...

Awesome! Now, all three of our datasets share the following datapoints:

  1. Wins by color

  2. Moves

And, in addition, datasets 1 and 2 both have ratings for white and black.

Let's start by comparing the average player ratings in the first two datasets, and then show the average number of moves & most likely game outcomes across all 3:

To do this, we'll create a bar chart for the average ratings, a bar chart for the average number of moves, and a stacked bar chart for the win percentages.

In [484]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

ratings_df1 = pd.DataFrame({
    'Dataset': ['Lichess Games'] * 2,
    'Color': ['White', 'Black'],
    'Rating': [round(df['white_rating'].mean(), 2), round(df['black_rating'].mean(), 2)]
})

ratings_df2 = pd.DataFrame({
    'Dataset': ['Grandmaster Games'] * 2,
    'Color': ['White', 'Black'],
    'Rating': [round(df2['white_rating'].mean(), 2), round(df2['black_rating'].mean(), 2)]
})

ratings_combined = pd.concat([ratings_df1, ratings_df2])

# Calculate average moves length for all three datasets with rounding
df['moves_length'] = df['moves'].str.count(' ') + 1
df2['moves_length'] = df2['moves'].str.count(' ') + 1
df3['moves_length'] = df3['moves'].str.count(' ') + 1

moves_df = pd.DataFrame({
    'Dataset': ['Lichess Games', 'Grandmaster Games', 'AI Games'],
    'Average Moves': [round(df['moves_length'].mean(), 2), round(df2['moves_length'].mean(), 2), round(df3['moves_length'].mean(), 2)]
})

# Calculate win percentages for all three datasets with rounding
def get_win_percentages(df):
    total = len(df)
    white_wins = round(len(df[df['winner'] == 'white']) / total * 100, 2)
    black_wins = round(len(df[df['winner'] == 'black']) / total * 100, 2)
    draws = round(len(df[df['winner'] == 'draw']) / total * 100, 2)
    return white_wins, black_wins, draws

wins_df1 = get_win_percentages(df)
wins_df2 = get_win_percentages(df2)
wins_df3 = get_win_percentages(df3)
In [485]:
fig_ratings = go.Figure()
for dataset, color, rating in zip(ratings_combined['Dataset'], ratings_combined['Color'], ratings_combined['Rating']):
    bar_color = 'white' if color == 'White' else 'black'
    outline_color = 'black'
    fig_ratings.add_trace(go.Bar(
        x=[color + " (" + dataset + ")"],
        y=[rating],
        name=f"{dataset} - {color}",
        marker=dict(color=bar_color, line=dict(color=outline_color, width=1.5))
    ))

fig_ratings.update_layout(
    title="Average Ratings by Dataset",
    yaxis_title="Rating",
    xaxis_title="Color"
)

# Moves plot for all three datasets with grayscale and outline
fig_moves = px.bar(
    moves_df, 
    x="Dataset", 
    y="Average Moves", 
    title="Average Number of Moves",
    labels={"Average Moves": "Moves"},
    color_discrete_sequence=["dimgray", "gray", "lightgray"]
)
fig_moves.update_traces(marker_line_width=1.5, marker_line_color="black")

# Win percentage plot for all three datasets with grayscale and outline
fig_wins = go.Figure()
x = ['Lichess Games', 'Grandmaster Games', 'AI Games']
fig_wins.add_trace(go.Bar(
    x=x, 
    y=[wins_df1[0], wins_df2[0], wins_df3[0]], 
    name='White Wins',
    marker=dict(color='white', line=dict(color='black', width=1.5))
))
fig_wins.add_trace(go.Bar(
    x=x, 
    y=[wins_df1[1], wins_df2[1], wins_df3[1]], 
    name='Black Wins',
    marker=dict(color='black', line=dict(color='black', width=1.5))
))
fig_wins.add_trace(go.Bar(
    x=x, 
    y=[wins_df1[2], wins_df2[2], wins_df3[2]], 
    name='Draws',
    marker=dict(color='gray', line=dict(color='black', width=1.5))
))
fig_wins.update_layout(
    title="Game Outcomes",
    xaxis_title="Dataset",
    yaxis_title="Percentage",
    barmode="group"
)
fig_ratings.update_layout(showlegend=True)
fig_moves.update_layout(showlegend=True)
fig_wins.update_layout(showlegend=True)

# Display plots
fig_ratings.show()
fig_moves.show()
fig_wins.show()

Awesome, so we already see some pretty distinct differences across the datasets:

  1. The average grandmaster rating is about 1,000 Elo points higher than the average Lichess player rating

  2. The average number of moves is higher in grandmaster games, and much higher in the AI games

  3. The draw rate is about 10x higher in the grandmaster games and 20x higher in the AI games. The white-to-black win ratio also increases.

These trends match what we would expect to see as player ability increases, and what we already observed within dataset 1. It would be interesting to compare these trends when player ratings are held equal across the datasets, but at this scale of analysis, we can attribute the majority of the differences to differences in ability.

To answer our final question, let's see how the number of moves correlates with player rating across the first two datasets. To make this more granular, let's also see how this differs across the three most popular opening moves: e4, d4, and Nf3:

In [486]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Get 1st move for both datasets
def get_first_move(moves_str):
    try:
        return moves_str.split()[0]
    except:
        return None

# preparing data from both datasets
df['first_move'] = df['moves'].apply(get_first_move)
df['avg_rating'] = (df['white_rating'] + df['black_rating']) / 2
df['moves_length'] = df['moves'].str.count(' ') + 1

df2['first_move'] = df2['moves'].apply(get_first_move)
df2['avg_rating'] = (df2['white_rating'] + df2['black_rating']) / 2
df2['moves_length'] = df2['moves'].str.count(' ') + 1

#  figure w/ subplots
fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=('Games Starting with e4', 'Games Starting with d4', 'Games Starting with Nf3')
)

# traces are added for each opening move
for idx, opening in enumerate(['e4', 'd4', 'Nf3']):
    # Lichess games
    df_opening = df[df['first_move'] == opening]
    fig.add_trace(
        go.Scatter(
            x=df_opening['avg_rating'],
            y=df_opening['moves_length'],
            mode='markers',
            name=f'Lichess {opening}',
            marker=dict(
                size=6,
                color='rgba(128, 128, 128, 0.5)',
                line=dict(width=0)
            ),
            opacity=0.6,
            showlegend=idx == 0 
        ),
        row=1, col=idx+1
    )
    
    
    df2_opening = df2[df2['first_move'] == opening]
    fig.add_trace(
        go.Scatter(
            x=df2_opening['avg_rating'],
            y=df2_opening['moves_length'],
            mode='markers',
            name=f'GM {opening}',
            marker=dict(
                size=10,
                color='rgba(30, 30, 30, 1)',
                line=dict(width=1, color='rgba(240, 240, 240, 1)')
            ),
            showlegend=idx == 0 
        ),
        row=1, col=idx+1
    )

fig.update_layout(
    height=500,
    width=1500,
    template='plotly_white',
    showlegend=True
)

for i in range(3):
    fig.update_xaxes(title_text="Average Rating of Players", row=1, col=i+1)
    fig.update_yaxes(title_text="Number of Moves" if i==0 else "", row=1, col=i+1)

fig.show()

It appears that, though there is a lot of variation at every level, rating does have a slight, though consistent, correlation with the number of moves. To confirm this, let's calculate the correlations and p-values:

In [487]:
from scipy import stats

openings = ["e4", "d4", "Nf3"]

def calculate_correlation_stats(data, opening=None):
    if opening:
        data = data[data['first_move'] == opening]
    if len(data) > 0:
        correlation, p_value = stats.pearsonr(data['avg_rating'], 
                                            data['moves_length'])
        n = len(data)
        return correlation, p_value, n
    return None, None, 0

print("Correlation Analysis between Average Rating and Game Length:")
print("\n{:<20} {:<20} {:<15} {:<15} {:<10}".format(
    "Dataset", "Opening Move", "Correlation", "P-value", "n Games"))
print("-" * 80)

# For Lichess Games
# First all games
correlation, p_value, n = calculate_correlation_stats(df)
print("{:<20} {:<20} {:<15.3f} {:<15.3e} {:<10}".format(
    "Lichess", "All Games", correlation, p_value, n
))
# Then by opening
for opening in openings:
    correlation, p_value, n = calculate_correlation_stats(df, opening)
    if correlation is not None:
        print("{:<20} {:<20} {:<15.3f} {:<15.3e} {:<10}".format(
            "Lichess", opening, correlation, p_value, n
        ))

print("\n")  # space between datasets


correlation, p_value, n = calculate_correlation_stats(df2)
print("{:<20} {:<20} {:<15.3f} {:<15.3e} {:<10}".format(
    "GM Games", "All Games", correlation, p_value, n
))

for opening in openings:
    correlation, p_value, n = calculate_correlation_stats(df2, opening)
    if correlation is not None:
        print("{:<20} {:<20} {:<15.3f} {:<15.3e} {:<10}".format(
            "GM Games", opening, correlation, p_value, n
        ))
Correlation Analysis between Average Rating and Game Length:

Dataset              Opening Move         Correlation     P-value         n Games   
--------------------------------------------------------------------------------
Lichess              All Games            0.161           7.048e-116      20058     
Lichess              e4                   0.153           1.008e-66       12598     
Lichess              d4                   0.182           5.257e-35       4522      
Lichess              Nf3                  0.124           8.114e-04       725       


GM Games             All Games            0.072           1.447e-19       15603     
GM Games             e4                   0.063           1.629e-08       8017      
GM Games             d4                   0.067           3.279e-06       4885      
GM Games             Nf3                  0.108           5.672e-05       1381      

The math backs this up! As we predicted, the correlations are relatively weak (below .17 for all rows), but the p-values are incredibly small, far below the 5% threshold. This means that, though the correlations aren't large, we can say with a very high degree of certainty that they are statistically real, not a result of noise or random variation.

In GM games, the correlation is somewhat weaker. This could be because there's less variation in ratings overall, or because, at extremely high levels, player ratings have less of an overall impact on game length. Still, the correlation is reliably there.

For Lichess games, d4 games have the largest correlation between game length and player rating; in GM games, Nf3 games actually have the largest. While this could be explained by the nature of the opening moves, I am inclined to believe it comes down to sample size: of the top three openings, Nf3 has the smallest correlation for Lichess games and the largest for GM games--these extremes could likely be accounted for by the fact that its sample size is the smallest in both datasets, which always leads to more variation in the estimate.
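The sample-size argument can be made concrete. Under the usual approximation, the standard error of a Pearson correlation is about sqrt((1 - r^2)/(n - 2)), so the Nf3 estimates (n = 725 and n = 1,381) carry several times the uncertainty of the e4 estimates. A quick sketch using values from the table above:

```python
import math

def corr_se(r, n):
    """Approximate standard error of a sample Pearson correlation."""
    return math.sqrt((1 - r ** 2) / (n - 2))

# (r, n) pairs taken from the correlation table above
for label, r, n in [("Lichess e4 ", 0.153, 12598),
                    ("Lichess Nf3", 0.124, 725),
                    ("GM Nf3     ", 0.108, 1381)]:
    print(f"{label}: r = {r:.3f} +/- {corr_se(r, n):.3f}")
```

With a standard error around 0.04, the Lichess Nf3 correlation could plausibly sit anywhere from ~0.05 to ~0.20, so its rank among the openings shouldn't be over-interpreted.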

Part Four: Model Exploration

In our model, we'll be predicting the rating of the player using a k-nearest neighbors model. To do this, first, let's think about the key features that we'll want to use:

  1. The rating of the player's opponent. This could give a good indication of the player's skill level, but as we saw earlier, players are often matched against opponents of a different skill level. However, if we also have variables that show which player won the game, this metric becomes more useful.

  2. The winner / win status. The winner of the game, especially compared to the opponent's rating, will be the most important variable for this model. The win status -- since draws and losses on time have different implications for a player's rating -- will also be important. Plus, knowing the color of the winner and the loser could add additional information.

  3. The first move of the game. This could also be important, as the first move is correlated with rating, and there could be additional correlations when combined with the other variables.

To make this model, we will first:

a. Filter the data to only include the top 4 most popular opening moves (less noise)

b. Split the data into training and validation sets (90/10 split)

c. Run a k-nearest neighbors model. This model will:

  • only look at games with the same first move, the same winner and loser, the same color, and the same win status

  • find the 5 nearest neighbors based on the rating of their opponent.

  • use weighted averages of nearest neighbor ratings to predict the rating of the player.

  • use neighbors from the training set but be evaluated on the validation set.

Finally, we'll compare metrics to a simpler model, one that simply uses the average rating of the opponent, to see if our more complex model is actually an improvement.

To learn more about k-nearest neighbors, I would recommend this resource: https://www.geeksforgeeks.org/k-nearest-neighbours/
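Before building the full pipeline, the core prediction step in (c) can be sketched in a few lines: given a pool of games that already match on first move, winner, and win status, take the k games with the closest opponent ratings and average their player ratings, weighted by inverse rating distance. This is a simplified sketch with hypothetical array inputs, not the final implementation:

```python
import numpy as np

def knn_predict_rating(opp_rating, pool_opp_ratings, pool_player_ratings, k=5):
    """Weighted k-NN sketch: the pool arrays are games assumed to already
    match the target on first move / winner / win status."""
    pool_opp = np.asarray(pool_opp_ratings, dtype=float)
    pool_player = np.asarray(pool_player_ratings, dtype=float)
    dists = np.abs(pool_opp - opp_rating)   # distance = opponent-rating gap
    nearest = np.argsort(dists)[:k]         # indices of the k closest games
    weights = 1 / (dists[nearest] + 1)      # inverse-distance weights (+1 avoids /0)
    return np.sum(weights * pool_player[nearest]) / np.sum(weights)

# hypothetical pool: opponent ratings and the corresponding player ratings
pred = knn_predict_rating(1500, [1500, 1510, 1490, 2000], [1520, 1530, 1480, 2100], k=3)
print(round(pred, 1))
```

Note how the 2000-rated outlier game is excluded by the k cutoff, and the closest match (an exact 1500 opponent) dominates the weighted average.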

First, let's create the features we'll need. In this cell, we'll:

  • get the first move of the game by splitting the 'moves' string on whitespace and taking the first element [0]

  • filter the data to only include the top 4 most common moves by using value_counts() to get move frequencies, nlargest(4) to select top 4, and isin() to filter the dataframe

  • create the color and opponent rating columns using apply() with lambda functions:

    • player_color determined by comparing winner column to 'white'/'black'
    • opponent_rating selected from white/black rating based on player's color
In [488]:
# Create features from our data
df_model = df.copy()

# Get first move
df_model['first_move'] = df_model['moves'].apply(lambda x: x.split()[0])

# top 4 most common first moves
top_moves = df_model['first_move'].value_counts().nlargest(4).index
df_model = df_model[df_model['first_move'].isin(top_moves)]

# Create color & opponent rating columns
df_model['player_color'] = df_model.apply(
    lambda row: 'white' if row['winner'] == 'white' else 'black', 
    axis=1
)
df_model['opponent_rating'] = df_model.apply(
    lambda row: row['black_rating'] if row['player_color'] == 'white' else row['white_rating'],
    axis=1
)
df_model['player_rating'] = df_model.apply(
    lambda row: row['white_rating'] if row['player_color'] == 'white' else row['black_rating'],
    axis=1
)

print("Dataset Statistics:")
print(f"Total games: {len(df_model):,}")
print("\nGames by first move:")
move_counts = df_model['first_move'].value_counts()
for move in move_counts.index:
    print(f"{move}: {move_counts[move]:,} games ({move_counts[move]/len(df_model)*100:.1f}%)")
Dataset Statistics:
Total games: 18,561

Games by first move:
e4: 12,598 games (67.9%)
d4: 4,522 games (24.4%)
Nf3: 725 games (3.9%)
c4: 716 games (3.9%)

The next cell defines a helper function prepare_knn_data that processes our chess game dataset for k-nearest neighbors analysis:

  1. Creates a copy of the input dataframe to avoid SettingWithCopyWarning
  2. Extracts the first move of each game by:
    • Using the 'moves' column which contains space-separated move strings
    • Applying a lambda function that splits on whitespace and takes first element [0]
    • Stores result in new 'first_move' column using .loc accessor
  3. Filters to only include the 4 most common opening moves by:
    • Getting value counts of first moves
    • Taking top 4 with nlargest()
    • Using boolean indexing with isin() to filter dataframe
  4. Creates a 'player_color' feature by:
    • Using .apply() with lambda function
    • Comparing 'winner' column to determine if player was white/black
    • Setting via .loc accessor

The function returns the filtered and feature-engineered dataframe ready for KNN analysis

In [489]:
def prepare_knn_data(df):
    """Prepare data for KNN analysis by filtering and creating features"""
    # Create a copy of the dataframe to avoid warnings
    df_copy = df.copy()
    
    # Get first move
    df_copy.loc[:, 'first_move'] = df_copy['moves'].apply(lambda x: x.split()[0])
    
    # Get top 4 most common moves
    top_moves = df_copy['first_move'].value_counts().nlargest(4).index
    df_filtered = df_copy[df_copy['first_move'].isin(top_moves)].copy()
    
    # Create features for matching
    df_filtered.loc[:, 'player_color'] = df_filtered.apply(
        lambda row: 'white' if row['winner'] == 'white' else 'black', axis=1
    )
    
    return df_filtered

The next function implements a k-nearest neighbors approach for finding similar chess games:

  1. Input parameters:

    • target_game: A single game record to find matches for
    • training_data: DataFrame containing the pool of games to search through
    • k: Number of nearest neighbors to return (default=10)
  2. Filtering matches:

    • Creates a subset of games that share the target game's:
      • First move (e.g. 'e4')
      • Winner ('white', 'black', or 'draw')
    • Uses boolean indexing with the & operator to combine the two conditions
    • Creates a copy to avoid SettingWithCopyWarning
    • Returns None if fewer than k matching games exist
  3. Rating difference calculation:

    • For each matching game, calculates the absolute difference between its black player's rating and the target game's black rating (black's rating serves as the opponent rating throughout)
    • Uses the .loc accessor to add a 'rating_diff' column
  4. Neighbor selection:

    • Uses .nsmallest() to get the k games with the smallest rating differences
    • Returns a DataFrame containing the k nearest neighbor games
In [490]:
def find_nearest_neighbors(target_game, training_data, k=10):
    """Find k nearest neighbors based on matching criteria and rating difference"""
    # Match games with same first move and same winner
    matches = training_data[
        (training_data['first_move'] == target_game['first_move']) &
        (training_data['winner'] == target_game['winner'])
    ].copy()
    
    if len(matches) < k:
        return None
    
    # Calculate rating differences (black's rating is always used as the opponent rating)
    target_opp_rating = target_game['black_rating']
    matches.loc[:, 'rating_diff'] = abs(matches['black_rating'] - target_opp_rating)
    
    # Get k nearest neighbors
    neighbors = matches.nsmallest(k, 'rating_diff')
    return neighbors
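As a quick sanity check, here is the same matching-and-ranking logic applied to a handful of made-up games (the ratings are hypothetical, not from the dataset):

```python
import pandas as pd

# Hypothetical pool of candidate games: match on first move and winner,
# then rank by closeness of the opponent's (black's) rating, as in
# find_nearest_neighbors above.
pool = pd.DataFrame({
    'first_move':   ['e4', 'e4', 'd4', 'e4'],
    'winner':       ['white', 'white', 'white', 'black'],
    'black_rating': [1450, 1720, 1500, 1480],
})
target = {'first_move': 'e4', 'winner': 'white', 'black_rating': 1500}

matches = pool[(pool['first_move'] == target['first_move']) &
               (pool['winner'] == target['winner'])].copy()
matches['rating_diff'] = (matches['black_rating'] - target['black_rating']).abs()

print(matches.nsmallest(1, 'rating_diff')['black_rating'].tolist())  # [1450]
```

Only two of the four games survive the first-move/winner filter, and the one whose black rating is closest to the target's (1450, a difference of 50) ranks first.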

The predict_rating() function below implements weighted k-nearest neighbors to predict player ratings:

  1. Input validation:

    • Checks if neighbors DataFrame exists and has data
    • Returns None if no valid neighbors
  2. Weight calculation:

    • Uses inverse distance weighting: 1/(rating_diff + 1)
    • Adding 1 prevents division by zero
    • Closer neighbors get higher weights
  3. Rating extraction:

    • Gets the player's rating from each neighbor game based on the color they played
    • Uses DataFrame.apply() to select the correct rating column
  4. Final prediction:

    • Calculates weighted average of neighbor ratings
    • Returns this as the predicted rating
In [491]:
def predict_rating(neighbors):
    """Predict rating using weighted average of neighbors"""
    if neighbors is None or len(neighbors) == 0:
        return None
        
    weights = 1 / (neighbors['rating_diff'] + 1)  # Add 1 to avoid division by zero
    ratings = neighbors.apply(
        lambda x: x['white_rating'] if x['player_color'] == 'white' else x['black_rating'],
        axis=1
    )
    
    return (ratings * weights).sum() / weights.sum()
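The inverse-distance weighting can be illustrated with a few made-up neighbors (all numbers below are hypothetical):

```python
import pandas as pd

# Hypothetical neighbors: rating differences of 0, 50, and 200 against the
# target's opponent rating, with the corresponding player ratings.
neighbors = pd.DataFrame({
    'rating_diff': [0, 50, 200],
    'rating': [1500, 1600, 1900],
})

weights = 1 / (neighbors['rating_diff'] + 1)  # closer neighbors weigh more
prediction = (neighbors['rating'] * weights).sum() / weights.sum()

# The exact match (diff 0) dominates, pulling the prediction toward 1500
print(round(prediction, 1))  # 1503.9
```

Even though the three neighbor ratings average 1666.7 unweighted, the weighted prediction lands near the closest neighbor's 1500, which is exactly the behavior the weighting is meant to produce.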

The next cell evaluates our k-nearest neighbors model on a validation set. It splits our data into training (90%) and validation (10%) sets using scikit-learn's train_test_split.

For each game in the validation set, the code:

  1. Finds similar games from the training data
  2. Uses these similar games to predict the player's rating
  3. Stores both the predicted and actual ratings

This process lets us measure how accurate our model is by comparing its predictions to known ratings on games it hasn't seen before.

In [492]:
# Prepare data
from sklearn.model_selection import train_test_split

df_knn = prepare_knn_data(df)
train_data, val_data = train_test_split(df_knn, test_size=0.1, random_state=42)

# Make predictions on validation set
predictions = []
actual_ratings = []

for _, game in val_data.iterrows():
    neighbors = find_nearest_neighbors(game, train_data)
    predicted_rating = predict_rating(neighbors)
    
    if predicted_rating is not None:
        predictions.append(predicted_rating)
        actual_ratings.append(
            game['white_rating'] if game['player_color'] == 'white' else game['black_rating']
        )

The next cell calculates and compares performance metrics between our k-nearest neighbors model and a simple baseline model.

For the KNN model, it calculates absolute differences between predicted and actual ratings, then computes several metrics:

  • Percentage of predictions within 100/200/300 rating points
  • Median absolute error (robust to outliers)
  • Mean absolute error (standard accuracy measure)

To validate the KNN model's value, it compares against a basic baseline that simply predicts each player's rating to be equal to their opponent's rating. This comparison helps us understand if the more complex KNN approach actually provides meaningful improvements over simple heuristics.

The cell outputs a comparison table and calculates improvement percentages to measure how much better the KNN model performs than the baseline, helping justify whether its additional complexity is worthwhile.

In [493]:
import numpy as np

differences = np.abs(np.array(predictions) - np.array(actual_ratings))
metrics = {
    'within_100': np.mean(differences <= 100) * 100,
    'within_200': np.mean(differences <= 200) * 100,
    'within_300': np.mean(differences <= 300) * 100,
    'median_error': np.median(differences),
    'mean_error': np.mean(differences)
}

# Simple baseline model: predict rating based on opponent's rating
baseline_predictions = []
baseline_actuals = []

for _, game in val_data.iterrows():
    # Get opponent rating
    opponent_rating = game['black_rating'] if game['player_color'] == 'white' else game['white_rating']
    baseline_predictions.append(opponent_rating)
    
    # Get actual rating
    actual_rating = game['white_rating'] if game['player_color'] == 'white' else game['black_rating']
    baseline_actuals.append(actual_rating)

# Calculate baseline metrics
baseline_differences = np.abs(np.array(baseline_predictions) - np.array(baseline_actuals))
baseline_metrics = {
    'within_100': np.mean(baseline_differences <= 100) * 100,
    'within_200': np.mean(baseline_differences <= 200) * 100,
    'within_300': np.mean(baseline_differences <= 300) * 100,
    'median_error': np.median(baseline_differences),
    'mean_error': np.mean(baseline_differences)
}

# Print comparison
print("Model Comparison:\n")
print("Metric                  KNN Model    Baseline Model")
print("-" * 50)
print(f"Within 100 points:     {metrics['within_100']:8.1f}%    {baseline_metrics['within_100']:8.1f}%")
print(f"Within 200 points:     {metrics['within_200']:8.1f}%    {baseline_metrics['within_200']:8.1f}%")
print(f"Within 300 points:     {metrics['within_300']:8.1f}%    {baseline_metrics['within_300']:8.1f}%")
print(f"Median absolute error: {metrics['median_error']:8.1f}    {baseline_metrics['median_error']:8.1f}")
print(f"Mean absolute error:   {metrics['mean_error']:8.1f}    {baseline_metrics['mean_error']:8.1f}")

# Calculate improvement percentages
improvements = {
    'within_100': ((metrics['within_100'] - baseline_metrics['within_100']) / baseline_metrics['within_100']) * 100,
    'within_200': ((metrics['within_200'] - baseline_metrics['within_200']) / baseline_metrics['within_200']) * 100,
    'within_300': ((metrics['within_300'] - baseline_metrics['within_300']) / baseline_metrics['within_300']) * 100,
    'median_error': ((baseline_metrics['median_error'] - metrics['median_error']) / baseline_metrics['median_error']) * 100,
    'mean_error': ((baseline_metrics['mean_error'] - metrics['mean_error']) / baseline_metrics['mean_error']) * 100
}

print("\nKNN Model Improvements:")
print(f"Improvement in within 100: {improvements['within_100']:+.1f}%")
print(f"Improvement in within 200: {improvements['within_200']:+.1f}%")
print(f"Improvement in within 300: {improvements['within_300']:+.1f}%")
print(f"Improvement in median error: {improvements['median_error']:+.1f}%")
print(f"Improvement in mean error: {improvements['mean_error']:+.1f}%")
Model Comparison:

Metric                  KNN Model    Baseline Model
--------------------------------------------------
Within 100 points:         69.9%        47.7%
Within 200 points:         85.2%        70.2%
Within 300 points:         92.7%        83.0%
Median absolute error:      8.2       107.0
Mean absolute error:       82.6       165.4

KNN Model Improvements:
Improvement in within 100: +46.7%
Improvement in within 200: +21.5%
Improvement in within 300: +11.6%
Improvement in median error: +92.3%
Improvement in mean error: +50.1%

Results:

Our KNN model significantly outperforms the baseline model across all metrics. Let's analyze the specific improvements:

Accuracy Within Rating Ranges:

  • Within 100 points: Our model correctly predicts 69.9% of ratings within 100 points, compared to the baseline's 47.7%, a substantial 46.7% relative improvement
  • Within 200 points: 85.2% accuracy vs 70.2% baseline (21.5% improvement)
  • Within 300 points: 92.7% accuracy vs 83.0% baseline (11.6% improvement)

Error Metrics:

  • Median absolute error: Our model achieves an impressive 8.2 points vs the baseline's 107.0 points - a 92.3% reduction in error
  • Mean absolute error: 82.6 points vs 165.4 points baseline (50.1% improvement)

The stark difference between our model's median error (8.2) and mean error (82.6) suggests the presence of outliers that are skewing the mean higher. This could be attributed to several factors:

  1. New players with provisional ratings (often starting at 1500) whose true skill level isn't yet reflected in their rating
  2. Players who are rapidly improving or declining in skill
  3. Games with unusual patterns that don't fit the typical relationships our model has learned
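A tiny numeric sketch (with made-up error values) shows how a few large misses inflate the mean while leaving the median almost untouched:

```python
import numpy as np

# Hypothetical absolute errors: most predictions are near-exact, but two
# outliers (e.g. provisional-rating players) miss by hundreds of points.
errors = np.array([5, 6, 7, 8, 9, 10, 500, 650])

print(f"median: {np.median(errors):.1f}")  # 8.5  -- barely affected
print(f"mean:   {np.mean(errors):.1f}")    # 149.4 -- pulled up by outliers
```

This mirrors the gap between our model's 8.2 median and 82.6 mean error: the typical prediction is very close, but a minority of hard cases dominates the mean.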

Despite these outlier cases, our model demonstrates strong predictive power, successfully leveraging multiple features (the opponent's rating, the game's winner, and the opening move) to achieve significantly better predictions than using the opponent's rating alone. The high accuracy within 300 points (92.7%) and the dramatic reduction in median error (a 92.3% improvement) particularly highlight the model's effectiveness.