# ---- Imports (class libraries only) ----
import xml.etree.ElementTree as ET            # Python stdlib XML (for GPX files)
from math import radians, sin, cos, asin, sqrt
from pathlib import Path
import re, warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns                          # used in Weeks 3–4 for KDE/scatter
from scipy import stats

import plotly.express as px                    # Week 6 (interactive)
import plotly.graph_objects as go
import folium                                  # Week 5 (geospatial)

from IPython.display import display, HTML, IFrame

# ---- Project paths: walk up from cwd to find the repo root ----
import os as _os
def _find_root():
    """Find the repo root by locating data/race_day/ with many runner folders.

    The current working directory, its parent and its grandparent are tried in
    that order.  A candidate qualifies when it contains a ``data/race_day``
    directory with more than ``min_runner_dirs`` sub-directories.

    Returns:
        Path: the first qualifying candidate.

    Raises:
        FileNotFoundError: if none of the three candidates qualifies.
    """
    min_runner_dirs = 10
    cwd = Path(_os.getcwd())
    for candidate in (cwd, cwd.parent, cwd.parent.parent):
        rday = candidate / 'data' / 'race_day'
        # is_dir() rather than exists(): a stray *file* named race_day would make
        # iterdir() raise NotADirectoryError instead of trying the next candidate.
        if not rday.is_dir():
            continue
        if sum(1 for p in rday.iterdir() if p.is_dir()) > min_runner_dirs:
            return candidate
    raise FileNotFoundError(
        'Cannot find social-data-project root.\n'
        'Launch Jupyter from within social-data-project/ or its notebook/ folder.')
# All project paths hang off the detected repository root.
ROOT     = _find_root()
DATA_DIR = ROOT / 'data'
# Survey responses spreadsheet (the file name, emoji included, must match the export exactly).
SURVEY   = DATA_DIR / 'processed' / '🏃 Lyngby Halvmarathon 2026 — Datos del corredor (respuestas).xlsx'
# One sub-folder per runner, each expected to hold that runner's .gpx file.
GPX_DIR  = DATA_DIR / 'race_day'
# Outputs go to the current working directory (the notebook folder when Jupyter is launched there).
OUT_DIR  = Path(_os.getcwd())                                # write outputs next to notebook

# Sanity check: confirm the inputs are where we expect before any parsing starts.
print('Survey file exists :', SURVEY.exists())
print('GPX folders found  :', sorted(p.name for p in GPX_DIR.iterdir() if p.is_dir()))

Survey file exists : True
GPX folders found  : ['Alex Torres', 'Álvaro Martinez', 'Carlos Sainz', 'Coline Petit', 'Cristina Ramon', 'Célien Moreau', 'Eloi Colprim', 'Isabel Vidal', 'Jon Larranaga', 'Jose Martinez', 'Lucia Pampuro', 'Marcus Henriksen', 'Maria Caballero', 'Marta Arana', 'Nina Larsson', 'Oier ', 'Oriol Rovira', 'Pablo Arce', 'Pablo Baurier', 'Roger Sala', 'Sofia Ortiz', 'Thibaut Heim', 'Théophile Blanc', 'Unai Pascual', 'Yann Dubois']

# ---- Palette (intended to mirror the website CSS variables) ----
WHITE = '#ffffff'
ACC   = '#E8FF00'              # neon yellow accent (highlights)
EXP   = '#00C8FF'              # cyan: The Experienced
GRI   = '#FF3366'              # pink: The Grinders
BEL   = '#AAFF00'              # lime: The Believers
# Archetype label -> colour, and the fixed order used whenever plots iterate over archetypes.
ARCH_COLORS = {'The Experienced': EXP, 'The Grinders': GRI, 'The Believers': BEL}
ARCH_ORDER  = ['The Experienced', 'The Grinders', 'The Believers']

# ---- Dark matplotlib theme (mirrors the website aesthetic) ----
# rcParams.update is global: it affects every matplotlib figure created after this point.
mpl.rcParams.update({
    'figure.facecolor': '#111111', 'axes.facecolor': '#1a1a1a',
    'axes.edgecolor':   '#333333', 'axes.labelcolor': '#cccccc',
    'axes.titlecolor':  '#ffffff', 'xtick.color': '#888888',
    'ytick.color':      '#888888', 'text.color':  '#cccccc',
    'grid.color':       '#2a2a2a', 'grid.linestyle': '--',
    'grid.linewidth':   0.6,       'legend.facecolor': '#222222',
    'legend.edgecolor': '#444444', 'legend.labelcolor': '#cccccc',
    'font.family':      'sans-serif', 'font.size': 11,
})
# Give seaborn the same axes and grid colours as the rcParams above.
sns.set_style({'axes.facecolor':'#1a1a1a', 'grid.color':'#2a2a2a'})
print('Theme loaded.')

Theme loaded.

# ---- 3.2a Load and rename ----
# Read the survey export as-is; the headers are the original form questions.
raw = pd.read_excel(SURVEY)

# Original column header -> short snake_case name used in the rest of the notebook.
# Keys must match the spreadsheet headers character for character (two of them end
# with a trailing space); DataFrame.rename silently ignores keys that are absent.
rename_map = {
    'Marca temporal':                                                              'timestamp',
    'Nombre o apodo':                                                              'name',
    'Edad (años)':                                                                 'age',
    'Género':                                                                      'gender_raw',
    'Peso aproximado (kg)':                                                        'weight_kg',
    '¿Cuántas semanas llevas entrenando para esta carrera?':                       'weeks_str',
    '¿Cuántos km corres por semana de media?':                                     'kmweek_str',
    '¿Qué tipo de entrenamiento has hecho principalmente?':                        'training_type',
    '¿Es tu primera media maratón?':                                               'first_hm',
    'Si ya has corrido una media maratón antes, ¿cuál es tu mejor marca?':         'prev_pr',
    '¿Cuál es tu objetivo de tiempo?':                                             'target_band',
    'tiempo objetivo':                                                             'target_time',
    '¿Cómo de confiado/a te sientes de cara a la carrera?':                        'confidence',
    'Has tenido alguna lesion los ultimos 6 meses ':                               'injury',
    '¿Cuál es tu mayor preocupación para la carrera?':                             'worry',
    '¿Cuál es tu estrategia de ritmo?':                                            'pace_strategy',
    'finish time ':                                                                'finish_time',
}
s = raw.rename(columns=rename_map).copy()
# 'name' is the lookup key into the GPX crosswalk later on, so strip stray whitespace.
# NOTE: astype(str) turns a missing name into the literal string 'nan'.
s['name'] = s['name'].astype(str).str.strip()

# ---- 3.2b Range strings → numeric midpoints ----
# Each survey answer is a range label (Spanish or English wording); it is replaced by a
# single representative number.  Series.map yields NaN for any label not listed here.
# The range separators in the keys are en dashes (–), not ASCII hyphens.
KMWEEK = {
    'Menos de 10 km/semana': 7,  'Less than 10 km/week': 7,
    '10–20 km/semana': 15,       '10–20 km/week': 15,
    '20–30 km/semana': 25,       '20–30 km/week': 25,
    '30–45 km/semana': 37,       '30–45 km/week': 37,
    '45–60 km/semana': 52,       '45–60 km/week': 52,
}
WEEKS = {
    'casi nada': 1, '1–3 semanas (casi nada)': 2, '1–3 weeks (barely started)': 2,
    '4–6 semanas': 5, '4–6 weeks': 5,
    '7–9 semanas': 8, '7–9 weeks': 8,
    '10–12 semanas': 11, '10–12 weeks': 11,
    '13–16 semanas': 14, '13–16 weeks': 14,
    '17 semanas o más': 18, '17+ weeks': 18,
}
# Only these four labels are recognised; any other gender answer maps to NaN.
GENDER = {'Mujer': 'F', 'Female': 'F', 'Hombre': 'M', 'Male': 'M'}

s['km_per_week']     = s['kmweek_str'].map(KMWEEK)
s['training_weeks']  = s['weeks_str'].map(WEEKS)
s['gender']          = s['gender_raw'].map(GENDER)

# Binary features used by the myth tests
# Both are case-insensitive regex matches on free-text answers (missing answers count as 0).
# NOTE(review): 'trained_hills' is derived from the 'worry' column (biggest race concern),
# not from a training question — confirm this proxy is intended.
s['trained_hills']    = s['worry'].fillna('').str.contains('km 14|subida|hill', case=False).astype(int)
s['trains_intervals'] = s['training_type'].fillna('').str.contains('intervals|series|fartlek', case=False).astype(int)

def to_min(x):
    """Convert a free-text finish/target time to minutes.

    Missing values give NaN; plain numbers are returned unchanged as floats.
    Text must start with ``a:b`` or ``a:b:c`` (digits only), otherwise NaN.
    When the leading field is below 4 it is read as hours (``h:mm[:ss]``);
    otherwise it is read as minutes with the second field as seconds.
    """
    if pd.isna(x):
        return np.nan
    if isinstance(x, (int, float)):
        return float(x)

    parsed = re.match(r'^(\d+):(\d+)(?::(\d+))?', str(x).strip())
    if parsed is None:
        return np.nan

    # The optional third field defaults to 0 when absent.
    lead, mid, tail = (int(part or 0) for part in parsed.groups())
    if lead < 4:
        # hours : minutes : seconds
        return lead * 60 + mid + tail / 60.0
    # leading field already in minutes
    return lead + mid/60 + tail/3600

s['finish_min_survey'] = s['finish_time'].apply(to_min)
s['target_min']        = s['target_time'].astype(str).apply(to_min)

print('After cleaning:', s.shape)
display(s[['name','gender','age','km_per_week','training_weeks',
           'trained_hills','trains_intervals','target_min','finish_min_survey']].head(10))

After cleaning: (26, 24)

# XML namespace prefixes used when querying GPX files:
#   'g' = core GPX 1.1 elements (trkpt, ele, time)
#   't' / 'c' = the two vendor extension namespaces searched for heart rate.
NS = {
    'g': 'http://www.topografix.com/GPX/1/1',
    't': 'http://www.garmin.com/xmlschemas/TrackPointExtension/v1',   # Garmin
    'c': 'http://www.cluetrust.com/XML/GPXDATA/1/0',                  # COROS
}

def haversine_m(lat1, lon1, lat2, lon2):
    """Great-circle distance between two (lat, lon) pairs, in metres.

    Coordinates are in decimal degrees.  Uses the haversine formula on a
    sphere of radius 6 371 000 m.
    """
    R = 6_371_000.0
    p1, p2 = radians(lat1), radians(lat2)
    dp, dl = radians(lat2 - lat1), radians(lon2 - lon1)
    a = sin(dp/2)**2 + cos(p1) * cos(p2) * sin(dl/2)**2
    # Floating-point rounding can leave `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt()/asin() raise ValueError.
    a = min(1.0, max(0.0, a))
    return 2 * R * asin(sqrt(a))

def parse_gpx(path):
    """Return a tidy DataFrame of trackpoints for one .gpx file.

    One row per GPX 1.1 ``<trkpt>``, with columns ``lat``, ``lon``, ``ele``,
    ``time`` and ``hr``.  Elevation, time and heart rate are optional per
    point: a missing or empty element gives NaN (NaT for ``time``).  Heart
    rate is looked up in the ``NS['t']`` extension namespace first, then in
    ``NS['c']``.

    Raises whatever ``ET.parse`` raises for an unreadable or malformed file,
    and ``KeyError`` if a trackpoint lacks the ``lat``/``lon`` attributes.
    """
    def _text(elem):
        # Stripped text of an optional element; None when absent or blank.
        if elem is None or elem.text is None:
            return None
        return elem.text.strip() or None

    garmin_hr = f".//{{{NS['t']}}}hr"
    other_hr  = f".//{{{NS['c']}}}hr"

    rows = []
    for tp in ET.parse(path).iter(f"{{{NS['g']}}}trkpt"):
        lat = float(tp.attrib['lat'])
        lon = float(tp.attrib['lon'])
        ele_txt = _text(tp.find('g:ele', NS))
        t_txt   = _text(tp.find('g:time', NS))
        # Compare against None explicitly: an Element without children is
        # falsy, so `find(a) or find(b)` would wrongly discard a found <hr>.
        hr_e = tp.find(garmin_hr)
        if hr_e is None:
            hr_e = tp.find(other_hr)
        hr_txt = _text(hr_e)
        rows.append((
            lat,
            lon,
            float(ele_txt) if ele_txt is not None else np.nan,
            pd.Timestamp(t_txt) if t_txt is not None else pd.NaT,
            # int(float(...)) also accepts decimal strings such as '150.0'.
            int(float(hr_txt)) if hr_txt is not None else np.nan,
        ))
    return pd.DataFrame(rows, columns=['lat','lon','ele','time','hr'])

def km_splits(track):
    """Aggregate raw trackpoints into per-kilometre rows: pace, mean HR, mean elevation.

    Points are ordered by time, cumulative haversine distance is accumulated,
    and each point is assigned to kilometre ``cum_m // 1000 + 1``.  Only
    kilometres 1–21 are reported.

    Args:
        track: DataFrame with ``lat``, ``lon``, ``ele``, ``time`` and ``hr``
            columns (as produced by ``parse_gpx``).  It is not modified.

    Returns:
        DataFrame with columns ``km``, ``pace``, ``hr``, ``ele`` — always
        present, even when no kilometre qualifies.
    """
    columns = ['km', 'pace', 'hr', 'ele']
    track = track.sort_values('time').reset_index(drop=True)
    lats, lons = track['lat'].values, track['lon'].values
    cum = np.zeros(len(track))
    for i in range(1, len(track)):
        cum[i] = cum[i-1] + haversine_m(lats[i-1], lons[i-1], lats[i], lons[i])
    track['cum_m'] = cum
    track['km']    = (track['cum_m'] // 1000).astype(int) + 1
    out = []
    for k, grp in track.groupby('km'):
        if k > 21: break                       # standard half-marathon length
        if grp['time'].isna().all(): continue
        # NOTE(review): duration is the time between the first and last point
        # *inside* the bucket, reported as min/km — the gap to the next bucket's
        # first point is not counted and a partial bucket is not rescaled.
        dur = (grp['time'].max() - grp['time'].min()).total_seconds() / 60.0
        if dur <= 0: continue
        out.append({
            'km':   int(k),
            'pace': dur,                                        # min / km
            'hr':   float(grp['hr'].dropna().mean()) if grp['hr'].notna().any() else np.nan,
            'ele':  float(grp['ele'].mean()),
        })
    # Passing `columns` keeps the schema stable when `out` is empty, so callers
    # can index splits['pace'] etc. without a KeyError.
    return pd.DataFrame(out, columns=columns)

# ---- Parse all GPX folders ----
# Builds `gpx`: runner folder name -> {'track': trackpoints, 'splits': per-km table,
# 'finish_min': elapsed minutes between the first and last timestamp in the file}.
import unicodedata

gpx = {}
for folder in sorted(GPX_DIR.iterdir()):
    if not folder.is_dir(): continue
    # glob() order depends on the filesystem; sort so that the same file is picked
    # on every run when a folder holds more than one .gpx.
    files = sorted(folder.glob('*.gpx'))
    if not files: continue
    raw_pts  = parse_gpx(files[0])
    splits   = km_splits(raw_pts)
    finish_m = (raw_pts['time'].max() - raw_pts['time'].min()).total_seconds() / 60.0
    # Folder names can arrive in decomposed (NFD) Unicode, e.g. from macOS, whereas the
    # names typed in GPX_TO_SURVEY are NFC.  Normalise once so accented names
    # ('Álvaro', 'Célien', …) compare equal and dictionary lookups succeed.
    runner = unicodedata.normalize('NFC', folder.name)
    gpx[runner] = {'track': raw_pts, 'splits': splits, 'finish_min': finish_m}
    print(f'{runner:18s}  {len(raw_pts):5d} pts   {len(splits):2d} km splits  '
          f'finish = {finish_m:6.2f} min')

Alex Torres          2253 pts   21 km splits  finish = 118.72 min
Álvaro Martinez      2253 pts   21 km splits  finish = 100.02 min
Carlos Sainz         2253 pts   21 km splits  finish = 135.27 min
Coline Petit         2253 pts   21 km splits  finish = 123.48 min

Cristina Ramon       1596 pts   21 km splits  finish = 112.98 min
Célien Moreau        2253 pts   21 km splits  finish =  78.22 min

Eloi Colprim         5915 pts   21 km splits  finish =  98.57 min
Isabel Vidal         2253 pts   21 km splits  finish = 115.43 min

Jon Larranaga        6263 pts   21 km splits  finish = 104.37 min
Jose Martinez        2253 pts   21 km splits  finish = 108.40 min

Lucia Pampuro        2253 pts   21 km splits  finish = 118.33 min
Marcus Henriksen     2253 pts   21 km splits  finish =  86.78 min

Maria Caballero      2253 pts   21 km splits  finish = 104.22 min

Marta Arana          6759 pts   21 km splits  finish = 112.88 min
Nina Larsson         2253 pts   21 km splits  finish = 133.68 min
Oier                 2253 pts   21 km splits  finish =  95.42 min

Oriol Rovira         4904 pts   21 km splits  finish =  81.72 min

Pablo Arce           2253 pts   21 km splits  finish = 101.57 min
Pablo Baurier        4779 pts   21 km splits  finish =  79.95 min

Roger Sala           6846 pts   21 km splits  finish = 114.08 min

# ---- Name crosswalk between GPX folders and survey names ----
# Key   = GPX folder name (exactly as it appears on disk, after NFC normalisation)
# Value = the 'name' the same runner entered in the survey (after whitespace stripping)
GPX_TO_SURVEY = {
    # ── GPS runners with survey matches (21) ──────────────────────────
    'Álvaro Martinez'  : 'Álvaro',
    'Carlos Sainz'     : 'Carlos Sainz',
    'Célien Moreau'    : 'Célien',
    'Coline Petit'     : 'Coline',
    'Cristina Ramon'   : 'Cristina',
    'Eloi Colprim'     : 'Eloi Colprim',
    'Jon Larranaga'    : 'Jon',
    'Jose Martinez'    : 'Jose',
    'Lucia Pampuro'    : 'Lucia',
    'Maria Caballero'  : 'Maria',
    'Marta Arana'      : 'Marta Arana',
    'Oier '            : 'Oierga',          # folder has trailing space
    'Oriol Rovira'     : 'Oriol',
    'Pablo Arce'       : 'Pablo Arce',
    'Pablo Baurier'    : 'Pablo Bauri',
    'Roger Sala'       : 'Roger Sala',
    'Sofia Ortiz'      : 'Sofia',
    'Thibaut Heim'     : 'Thibaut Heim',
    'Théophile Blanc'  : 'Théophile',
    'Unai Pascual'     : 'Unai',
    'Yann Dubois'      : 'Yann',
    # ── Synthetic survey entries added to complete the group to n=25 ──
    'Alex Torres'      : 'Alex Torres',
    'Isabel Vidal'     : 'Isabel Vidal',
    'Marcus Henriksen' : 'Marcus Henriksen',
    'Nina Larsson'     : 'Nina Larsson',
}
# Reverse lookup (survey name -> GPX folder).  The inversion relies on every survey
# name above being unique; a repeated value would silently keep only the last folder.
SURVEY_TO_GPX = {v: k for k, v in GPX_TO_SURVEY.items()}

def experience_level(row):
    """Map the free-text 'first_hm' answer of a survey row to an ordinal level.

    Returns 0 when the answer mentions a first race ('first' / 'primera'),
    1 when it mentions a small number of previous races ('1–3', '1-3',
    'pocas', 'few'), and 2 for everything else — including a missing answer.
    Matching is case-insensitive.
    """
    answer = str(row.get('first_hm', '')).lower()
    first_race_markers = ('first', 'primera')
    few_races_markers = ('1–3', '1-3', 'pocas', 'few')
    if any(marker in answer for marker in first_race_markers):
        return 0
    if any(marker in answer for marker in few_races_markers):
        return 1
    # Fallback bucket: 4+ half-marathons (also reached by unrecognised answers).
    return 2

s['experience_level'] = s.apply(experience_level, axis=1)

def label_archetype(r):
    """Assign one of the three runner archetypes to a survey row.

    Rows missing 'km_per_week' or 'training_weeks' are 'The Believers'.
    Otherwise: at least 25 km/week plus experience level >= 1 gives
    'The Experienced'; at least 25 km/week or at least 10 training weeks
    gives 'The Grinders'; anything else is 'The Believers'.
    """
    weekly_km = r['km_per_week']
    weeks = r['training_weeks']
    if pd.isna(weekly_km) or pd.isna(weeks):
        return 'The Believers'

    has_raced_before = r['experience_level'] >= 1
    high_volume = weekly_km >= 25
    if has_raced_before and high_volume:
        return 'The Experienced'
    if high_volume or weeks >= 10:
        return 'The Grinders'
    return 'The Believers'

s['archetype'] = s.apply(label_archetype, axis=1)

# ---- Per-runner pace/split features from the GPX subset ----
def half_pace(splits_df, first=True):
    """Mean per-km pace over one half of the race.

    ``first=True`` averages the rows with km <= 10, ``first=False`` the rows
    with km >= 11.  Returns NaN when ``splits_df`` is None or empty.
    """
    if splits_df is None or splits_df.empty:
        return np.nan
    km = splits_df['km']
    in_half = (km <= 10) if first else (km >= 11)
    return splits_df.loc[in_half, 'pace'].mean()

# Build one feature row per survey respondent, in survey order.
# Runners with a parsed GPX file get their finish time and half paces from the GPS
# track; everyone else falls back to the finish time they reported in the survey.
feat_cols = ['name', 'has_gps', 'finish_min', 'first_half_pace', 'last_half_pace', 'split_ratio']
rows = []
for survey_name, survey_finish in zip(s['name'], s['finish_min_survey']):
    gname = SURVEY_TO_GPX.get(survey_name)
    if gname and gname in gpx:
        sp = gpx[gname]['splits']
        # Compute each half once and reuse it for the ratio.
        first_half = half_pace(sp, True)
        last_half  = half_pace(sp, False)
        rows.append({
            'name':            survey_name,
            'has_gps':         True,
            'finish_min':      gpx[gname]['finish_min'],
            'first_half_pace': first_half,
            'last_half_pace':  last_half,
            'split_ratio':     last_half / first_half,     # > 1 means the second half was slower
        })
    else:
        rows.append({
            'name': survey_name, 'has_gps': False,
            'finish_min':      survey_finish,
            'first_half_pace': np.nan, 'last_half_pace': np.nan, 'split_ratio': np.nan,
        })
gps_feats = pd.DataFrame(rows, index=s.index, columns=feat_cols)

# Rows were generated one-to-one from `s`, so align on the index.  Merging on 'name'
# would multiply rows whenever two respondents entered the same nickname.
df = s.join(gps_feats.drop(columns='name'))
df = df.dropna(subset=['finish_min'])                            # drop the DNF (Costis)

# ---- Save the merged CSV (the deliverable from preprocessing) ----
# Columns exported to the analysis dataset, in output order.
keep = ['name','gender','age','archetype','experience_level',
        'km_per_week','training_weeks','trained_hills','trains_intervals',
        'target_min','finish_min','first_half_pace','last_half_pace',
        'split_ratio','has_gps']
# Fastest finisher first; the index is reset so row numbers follow that ranking.
df_clean = df[keep].sort_values('finish_min').reset_index(drop=True)
df_clean.to_csv(OUT_DIR / 'lyngby_runners_2026.csv', index=False)

print('Merged dataset:', df_clean.shape)
print('Saved →', OUT_DIR / 'lyngby_runners_2026.csv')
display(df_clean.head(8))

Merged dataset: (25, 15)
Saved → /Users/martaarana/Desktop/social-data-project/notebook/lyngby_runners_2026.csv

# Quick overview of the cleaned dataset: sample size, GPS coverage,
# archetype counts and descriptive statistics of the numeric variables.
print('Number of finishers     :', len(df_clean))
# has_gps is boolean, so its sum is the number of runners with a GPS file.
print('Finishers with GPS file :', df_clean['has_gps'].sum())
print('\nArchetype breakdown:')
print(df_clean['archetype'].value_counts().to_string())
print('\nDescriptive statistics (rounded):')
display(df_clean[['age','km_per_week','training_weeks','target_min','finish_min','split_ratio']]
        .describe().round(2))

Number of finishers     : 25
Finishers with GPS file : 25

Archetype breakdown:
archetype
The Grinders       11
The Experienced     7
The Believers       7

Descriptive statistics (rounded):

# ---- 5.1 Histogram + KDE for finish time (DAOST Ch.2) ----
# Density-normalised histogram with a Gaussian KDE overlay and a dashed line at the mean.
fig, ax = plt.subplots(figsize=(9, 4))
ax.hist(df_clean['finish_min'], bins=10, color=ACC, alpha=0.45,
        edgecolor='#333', density=True, label='Histogram')
kde = stats.gaussian_kde(df_clean['finish_min'])
# Evaluate the KDE slightly beyond the observed range so the tails are visible.
xs  = np.linspace(df_clean['finish_min'].min()-5, df_clean['finish_min'].max()+5, 200)
ax.plot(xs, kde(xs), color=EXP, lw=2.2, label='KDE')
ax.axvline(df_clean['finish_min'].mean(), color=GRI, lw=1.8, ls='--',
           label=f"Mean = {df_clean['finish_min'].mean():.1f} min")
ax.set_xlabel('Finish time (min)'); ax.set_ylabel('Density')
# Report the actual sample size instead of a hard-coded number that goes stale
# as soon as the dataset changes.
ax.set_title(f'Distribution of finish times (n = {len(df_clean)})', fontweight='bold')
ax.legend(); ax.grid(alpha=.3)
plt.tight_layout(); plt.show()

# ---- 5.2 Conditional distributions: finish time by archetype (Week 3) ----
# One filled Gaussian KDE per archetype on a shared 60–160 min axis.
fig, ax = plt.subplots(figsize=(9, 4))
for arch in ARCH_ORDER:
    vals = df_clean.loc[df_clean['archetype']==arch, 'finish_min']
    if len(vals) < 2: continue                 # a KDE needs at least two observations
    kde = stats.gaussian_kde(vals, bw_method=.5)
    xs  = np.linspace(60, 160, 200)
    ax.fill_between(xs, kde(xs), alpha=.45, color=ARCH_COLORS[arch], lw=0, label=arch)
    ax.plot(xs, kde(xs), color=ARCH_COLORS[arch], lw=2)
ax.set_xlabel('Finish time (min)'); ax.set_ylabel('P(finish | archetype)')
ax.set_title('Conditional finish-time distribution by archetype', fontweight='bold')
ax.legend(); ax.grid(alpha=.3)
plt.tight_layout(); plt.show()

# ---- 5.3 Pairwise scatter (Week 4, DAOST Ch. 3) ----
# Three panels: finish time against weekly km, training weeks and target time.
# Points are coloured by archetype; each panel adds a least-squares line and Pearson r.
fig, axes = plt.subplots(1, 3, figsize=(14, 4))
for ax, feat, label in zip(axes,
        ['km_per_week','training_weeks','target_min'],
        ['km / week','Training weeks','Target time (min)']):
    for arch in ARCH_ORDER:
        sub = df_clean[df_clean['archetype']==arch]
        ax.scatter(sub[feat], sub['finish_min'], color=ARCH_COLORS[arch], s=70,
                   alpha=.85, edgecolor='#333', lw=.4, label=arch, zorder=3)
    # Fit and correlation use only the rows where both variables are present.
    valid = df_clean.dropna(subset=[feat,'finish_min'])
    m, b = np.polyfit(valid[feat], valid['finish_min'], 1)
    xs   = np.linspace(valid[feat].min(), valid[feat].max(), 100)
    ax.plot(xs, m*xs + b, color='#888', lw=1.5, ls='--')
    r = np.corrcoef(valid[feat], valid['finish_min'])[0,1]
    ax.set_title(f'{label}   (r = {r:.2f})', fontsize=11)
    ax.set_xlabel(label); ax.set_ylabel('Finish (min)')
    ax.grid(alpha=.3)
axes[0].legend(fontsize=8)                     # one legend is enough; colours are shared
plt.suptitle('Pairwise scatter: training variables vs finish time', fontweight='bold', y=1.04)
plt.tight_layout(); plt.show()

# ---- 6.1a Interactive Folium map of the course (Week 5) ----
# Draws the reference runner's track as short segments coloured by that runner's
# per-km pace, marks the points nearest 13 km and 16 km, and saves the map as HTML.
GPX_REF = 'Oriol Rovira'                                                # representative track
track   = gpx[GPX_REF]['track'].dropna(subset=['lat','lon','time']).copy()
splits  = gpx[GPX_REF]['splits']

center = [track['lat'].mean(), track['lon'].mean()]
m = folium.Map(location=center, zoom_start=13, tiles='CartoDB dark_matter')

pmin, pmax = splits['pace'].min(), splits['pace'].max()
def pace_color(p):
    """Hex colour for pace `p`, scaled between the module-level pmin and pmax."""
    # The 1e-9 avoids a division by zero when every km has the same pace.
    t = (p - pmin) / (pmax - pmin + 1e-9)                              # 0 fast → 1 slow
    r = int(40 + 215 * t); g = int(200 - 160 * t); b = 40
    return f'#{r:02x}{g:02x}{b:02x}'

# Re-compute cumulative distance for km lookup along the polyline
track = track.sort_values('time').reset_index(drop=True)
lats, lons = track['lat'].values, track['lon'].values
cum = np.zeros(len(track))
for i in range(1, len(track)):
    cum[i] = cum[i-1] + haversine_m(lats[i-1], lons[i-1], lats[i], lons[i])
track['cum_m'] = cum

# Down-sample for performance (every 10th point is plenty for a polyline)
ds = track.iloc[::10].reset_index(drop=True)
for i in range(len(ds)-1):
    # Row position in `splits` for the segment's starting distance, clamped to the last row.
    # NOTE(review): this is a positional lookup; it lines up with the km number only
    # if `splits` has one row for every km starting at km 1 — confirm.
    km_idx = int(min(ds.cum_m.iloc[i] // 1000, len(splits)-1))
    col    = pace_color(splits['pace'].iloc[km_idx])
    folium.PolyLine(
        [[ds.lat.iloc[i], ds.lon.iloc[i]], [ds.lat.iloc[i+1], ds.lon.iloc[i+1]]],
        color=col, weight=4, opacity=0.85
    ).add_to(m)

# Mark the climb (km 13 and 16)
for km in (13, 16):
    # Trackpoint whose cumulative distance is closest to the km mark.
    idx = (track['cum_m'] - km*1000).abs().idxmin()
    folium.CircleMarker([track.lat.loc[idx], track.lon.loc[idx]], radius=7,
                        color=GRI, fill=True, fill_color=GRI, fill_opacity=.9,
                        tooltip=f'km {km} – climb').add_to(m)

m.save(OUT_DIR / 'fig_route_map.html')
print('Saved interactive map →', OUT_DIR / 'fig_route_map.html')
m

Saved interactive map → /Users/martaarana/Desktop/social-data-project/notebook/fig_route_map.html

# ---- 6.1b Elevation profile, coloured by pace (dark-themed) ----
# Uses `splits`, the per-km table of the reference runner assigned in the map cell.
from matplotlib.collections import LineCollection
import matplotlib.colors as mcolors

kms_s   = splits['km'].values
eles_s  = splits['ele'].values
paces_s = splits['pace'].values

# Build line segments for LineCollection
# Each segment joins consecutive (km, elevation) points, giving len(splits) - 1 segments.
points = np.array([kms_s, eles_s]).T.reshape(-1, 1, 2)
segs   = np.concatenate([points[:-1], points[1:]], axis=1)
pace_cmap = mcolors.LinearSegmentedColormap.from_list('pace_cmap',
              ['#00FF88', '#FFB800', '#FF3366'])
norm_p    = plt.Normalize(paces_s.min(), paces_s.max())
lc        = LineCollection(segs, cmap=pace_cmap, norm=norm_p,
              linewidth=3.5, zorder=4, capstyle='round')
# One colour value per segment: the pace of the km at the segment's start.
lc.set_array(paces_s[:-1])

fig, ax = plt.subplots(figsize=(14, 4), facecolor='#0e1117')
ax.set_facecolor('#0e1117')

# Shaded area under the profile, then the highlighted km 13–16 band and its label.
ax.fill_between(kms_s, eles_s, eles_s.min() - 8,
                color='#1a2540', alpha=0.85, zorder=1)
ax.axvspan(13, 16, color='#FF3366', alpha=0.10, zorder=2)
ax.annotate('Killer Climb ⛰', xy=(14.5, eles_s.max() + 2),
            ha='center', color='#FF3366', fontsize=9.5,
            alpha=0.95, fontweight='bold')
ax.add_collection(lc)

# Colour bar built from a standalone ScalarMappable sharing the line's cmap and norm.
sm = plt.cm.ScalarMappable(cmap=pace_cmap, norm=norm_p)
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax, pad=0.01, fraction=0.018)
cbar.set_label('Pace  (min/km)', color='#888', size=9)
cbar.ax.yaxis.set_tick_params(color='#555')
plt.setp(cbar.ax.yaxis.get_ticklabels(), color='#888', size=8)
cbar.ax.set_facecolor('#0e1117')
cbar.ax.spines[:].set_color('#333')

# add_collection does not autoscale the axes, so the limits are set explicitly.
ax.set_xlim(kms_s.min(), kms_s.max())
ax.set_ylim(eles_s.min() - 15, eles_s.max() + 20)
ax.set_xlabel('Distance (km)', color='#888', fontsize=10)
ax.set_ylabel('Elevation (m)', color='#888', fontsize=10)
# NOTE(review): the runner name in the title is hard-coded and is not derived from GPX_REF.
ax.set_title(
    'Course Elevation, pace-coloured GPS trace (Oriol Rovira, Garmin)'
    '   green = fast  /  red = slow',
    color='#ccc', fontsize=11, pad=10)
ax.tick_params(colors='#666', labelsize=9)
for sp in ['top', 'right']: ax.spines[sp].set_visible(False)
for sp in ['left', 'bottom']: ax.spines[sp].set_color('#2a2a2a')
ax.grid(True, color='#1e2530', linewidth=0.5, linestyle='--', alpha=0.6)
plt.tight_layout(pad=1.2)
plt.show()

# ---- Elevation profile: interactive Plotly, coloured by pace ----
# Interactive counterpart of the matplotlib profile: per-km elevation of the
# reference runner, with markers coloured by that km's pace.
import plotly.graph_objects as go

ref_gpx = gpx.get(GPX_REF)
if ref_gpx:
    # Only kilometres with all three values present can be plotted.
    sp_ref  = ref_gpx['splits'].dropna(subset=['km', 'ele', 'pace'])
    kms_e   = sp_ref['km'].values
    eles_e  = sp_ref['ele'].values
    paces_e = sp_ref['pace'].values
    p_min, p_max = paces_e.min(), paces_e.max()

    fig_elev = go.Figure()

    # Area fill under curve
    # (invisible line, no legend entry, no hover — purely decorative)
    fig_elev.add_trace(go.Scatter(
        x=kms_e, y=eles_e, mode='lines', fill='tozeroy',
        fillcolor='rgba(0,100,200,0.09)',
        line=dict(color='rgba(0,0,0,0)', width=0),
        showlegend=False, hoverinfo='skip',
    ))

    # Pace-coloured line + markers
    # The colour scale is pinned to the observed pace range via cmin/cmax.
    fig_elev.add_trace(go.Scatter(
        x=kms_e, y=eles_e, mode='lines+markers',
        marker=dict(
            size=5, color=paces_e,
            colorscale=[[0,'#00FF88'],[0.5,'#FFB800'],[1,'#FF3366']],
            showscale=True,
            colorbar=dict(
                title=dict(text='Pace<br>(min/km)', side='right'),
                thickness=12, len=0.7, tickfont=dict(size=9),
            ),
            cmin=p_min, cmax=p_max,
        ),
        line=dict(color='rgba(180,180,180,0.35)', width=1.5),
        hovertemplate='km %{x:.0f}  |  %{y:.0f} m  |  %{marker.color:.2f} min/km<extra></extra>',
        name='Elevation',
    ))

    # Highlight the km 13–16 band.
    fig_elev.add_vrect(
        x0=13, x1=16, fillcolor='rgba(255,50,50,0.13)',
        line_color='rgba(255,50,50,0.5)', line_dash='dash',
        annotation_text='Killer Climb', annotation_position='top right',
    )
    # NOTE(review): the runner name in the title is hard-coded and is not derived from GPX_REF.
    fig_elev.update_layout(
        template='plotly_dark', height=340,
        title='Course Elevation: Oriol Rovira (Garmin), colour = pace  (green fast, red slow)',
        xaxis_title='km', yaxis_title='Elevation (m)',
        showlegend=False,
        margin=dict(t=55, b=45, l=55, r=90),
    )
    fig_elev.show()
else:
    print('Elevation data not available.')

# Finish-time model: ordinary least squares on two training variables, blended
# with each runner's own target time.
from numpy.linalg import lstsq

# Keep only runners with every model input and the outcome present.
model_df = df_clean.dropna(subset=['km_per_week','training_weeks','target_min','finish_min']).copy()
X_feat   = model_df[['km_per_week','training_weeks']].values
X        = np.column_stack([np.ones(len(X_feat)), X_feat])       # leading column of ones = intercept
y        = model_df['finish_min'].values

beta, *_   = lstsq(X, y, rcond=None)
ols_pred   = X @ beta
# Final prediction: fixed 60/40 weighting of the OLS fit and the stated target time.
final_pred = 0.6 * ols_pred + 0.4 * model_df['target_min'].values

# ---- Week 4 metrics ----
# All metrics are computed on the same rows the model was fitted on (in-sample).
r       = np.corrcoef(final_pred, y)[0, 1]
rmse    = float(np.sqrt(np.mean((y - final_pred) ** 2)))
r2_ols  = 1 - np.sum((y - ols_pred)**2)   / np.sum((y - y.mean())**2)
r2_full = 1 - np.sum((y - final_pred)**2) / np.sum((y - y.mean())**2)

print(f'OLS coefficients   : intercept={beta[0]:.2f},  '
      f'β_km/week={beta[1]:.2f},  β_weeks={beta[2]:.2f}')
print(f'R² (OLS only)      : {r2_ols:.3f}')
print(f'R² (blended model) : {r2_full:.3f}')
print(f'Pearson r          : {r:.3f}')
print(f'RMSE               : {rmse:.2f} min')

# Stored for the predicted-vs-actual chart.
model_df['predicted_min'] = final_pred

OLS coefficients   : intercept=119.45,  β_km/week=-0.72,  β_weeks=0.36
R² (OLS only)      : 0.303
R² (blended model) : 0.712
Pearson r          : 0.891
RMSE               : 7.69 min

# ---- Predicted vs Actual - interactive Plotly chart (Week 6) ----
# One marker trace per archetype (so the legend can toggle groups), plus the
# y = x reference line on which a perfect prediction would fall.
fig_pa = go.Figure()
for arch in ARCH_ORDER:
    sub = model_df[model_df['archetype'] == arch]
    fig_pa.add_trace(go.Scatter(
        x=sub['finish_min'], y=sub['predicted_min'], mode='markers', name=arch,
        marker=dict(size=11, color=ARCH_COLORS[arch], line=dict(width=1, color='#333')),
        hovertemplate='%{text}<extra></extra>',
        # Hover text is pre-formatted per runner: name, actual and predicted time.
        text=sub['name'] + '<br>Actual: ' + sub['finish_min'].round(1).astype(str) + ' min'
                          + '<br>Predicted: ' + sub['predicted_min'].round(1).astype(str) + ' min',
    ))
# Diagonal spans the observed finish-time range with a 3-minute margin on each side.
lo, hi = model_df['finish_min'].min() - 3, model_df['finish_min'].max() + 3
fig_pa.add_trace(go.Scatter(x=[lo,hi], y=[lo,hi], mode='lines',
                            line=dict(color='#666', dash='dash'), name='Perfect prediction'))
fig_pa.update_layout(template='plotly_dark', height=460,
                     title=f'Predicted vs Actual finish time  (R² = {r2_full:.2f})',
                     xaxis_title='Actual finish (min)',
                     yaxis_title='Predicted finish (min)')
# Standalone HTML export; plotly.js is loaded from the CDN rather than embedded.
fig_pa.write_html(OUT_DIR / 'fig_predicted_actual.html', include_plotlyjs='cdn')
fig_pa.show()

# GPS-equipped runners subset (used across all myth tests)
# .copy() so that the columns added to `gps` later do not touch df_clean.
gps = df_clean[df_clean['has_gps']].copy()
print(f'GPS runners: {len(gps)} ({gps["trained_hills"].sum()} hill-trained)')

GPS runners: 25 (4 hill-trained)

# ---- Myth 1 - Interactive Plotly split ratio dot chart ----
# One row of dots per archetype; x = second-half pace / first-half pace.
import plotly.graph_objects as go

# Colours come from the shared ARCH_COLORS palette defined with the theme; the
# palette is deliberately not re-declared here so there is a single source of truth.
fig_m1i = go.Figure()

for arch in ARCH_ORDER:
    sub = gps[gps['archetype']==arch].dropna(subset=['split_ratio'])
    fig_m1i.add_trace(go.Scatter(
        x=sub['split_ratio'],
        y=[arch]*len(sub),
        mode='markers',
        name=arch,
        marker=dict(size=14, color=ARCH_COLORS[arch], line=dict(width=1, color='#333')),
        # Hover text is pre-formatted per runner: name, split ratio and finish time.
        text=sub['name'] + '<br>Split: ' + sub['split_ratio'].round(3).astype(str)
             + '<br>Finish: ' + sub['finish_min'].round(1).astype(str) + ' min',
        hovertemplate='%{text}<extra></extra>',
    ))

# Reference line at an even split and a shaded band for ratios between 0.95 and 1.0.
fig_m1i.add_vline(x=1.0, line_dash='dash', line_color='#E8FF00',
                  annotation_text='Even split', annotation_position='top right')
fig_m1i.add_vrect(x0=0.95, x1=1.0, fillcolor='rgba(0,200,255,0.06)',
                  line_color='rgba(0,200,255,0.3)', line_dash='dot',
                  annotation_text='0.95–1.0 zone', annotation_position='bottom left')
# NOTE: the x-axis range is fixed; ratios outside 0.87–1.25 fall outside the initial view.
fig_m1i.update_layout(
    template='plotly_dark', height=360,
    title='Myth 1: Split Ratio by Archetype (hover for runner names)<br>'
          '<sub>Values < 1.0 = negative split (sped up). Values > 1.0 = positive split (slowed down).</sub>',
    xaxis_title='Split Ratio (2nd half / 1st half)', yaxis_title='',
    xaxis=dict(range=[0.87, 1.25]),
)
fig_m1i.show()

# ---- Myth 1: per-km pace, each runner individually toggleable ----
# One line per GPS runner, coloured and legend-grouped by archetype.
from collections import Counter

import plotly.graph_objects as go

fig_m1 = go.Figure()
_arch_seen = set()

# Legend labels use the first name only, unless two runners share it (e.g. the two
# Pablos), in which case the full survey name is shown so entries stay distinguishable.
_first_name_counts = Counter(v.split()[0] for v in GPX_TO_SURVEY.values())

for gname, info in gpx.items():
    sname = GPX_TO_SURVEY.get(gname)
    if sname is None: continue
    row = df_clean[df_clean['name'] == sname]
    if row.empty: continue                      # e.g. runner dropped as a non-finisher
    arch   = row.iloc[0]['archetype']
    finish = row.iloc[0]['finish_min']
    target = row.iloc[0]['target_min']
    sp     = info['splits']
    first  = sname.split()[0]
    short  = sname if _first_name_counts[first] > 1 else first
    col    = ARCH_COLORS[arch]
    tgt_str = f'  Target: {target:.0f} min' if pd.notna(target) else ''
    fig_m1.add_trace(go.Scatter(
        x=sp['km'], y=sp['pace'],
        mode='lines',
        name=short,
        legendgroup=arch,
        # Only the first trace of each archetype carries the legend group title.
        legendgrouptitle=dict(text=arch, font=dict(size=11))
            if arch not in _arch_seen else dict(),
        showlegend=True,
        line=dict(color=col, width=1.8),
        opacity=0.80,
        hovertemplate=(
            f'<b>{sname}</b> ({arch})<br>'
            'km %{x}  |  Pace: %{y:.2f} min/km'
            f'<br>Finish: {finish:.1f} min{tgt_str}'
            '<extra></extra>'
        ),
    ))
    _arch_seen.add(arch)

# Highlight the km 13–16 band.
fig_m1.add_vrect(
    x0=13, x1=16, fillcolor='#E8FF00', opacity=0.07,
    annotation_text='Killer Climb', annotation_position='top left',
)
fig_m1.update_layout(
    template='plotly_dark', height=460,
    title='Myth 1: Per-km pace, click a name to hide/show, double-click to isolate',
    xaxis_title='Distance (km)', yaxis_title='Pace (min/km)',
    legend=dict(
        title='Runner (grouped by archetype)',
        groupclick='toggleitem',                # clicking toggles one runner, not the whole group
        itemsizing='constant',
        font=dict(size=10),
    ),
    margin=dict(r=170),
)
fig_m1.write_html(OUT_DIR / 'fig_myth1_pace_interactive.html', include_plotlyjs='cdn')
fig_m1.show()

# ---- Bootstrap 95% CI: split ratio by archetype (10 000 resamples) ----
# Percentile bootstrap of the group mean; the fixed seed makes the intervals reproducible.
rng_bs = np.random.default_rng(42)
ci_rows = []
for arch in ARCH_ORDER:
    vals = gps.loc[gps['archetype']==arch, 'split_ratio'].dropna().values
    if len(vals) == 0: continue
    # 10 000 resamples of size n drawn with replacement; one mean per resample.
    bs   = rng_bs.choice(vals, size=(10_000, len(vals)), replace=True).mean(axis=1)
    lo, hi = np.percentile(bs, [2.5, 97.5])
    ci_rows.append({'Archetype': arch, 'n': len(vals),
                    'Mean split': round(vals.mean(), 3),
                    '95% CI': f'[{lo:.3f}, {hi:.3f}]'})
print('Bootstrap 95% CI: split ratio by archetype (10 000 resamples):')
# print, not display: display() of a str shows its repr, with quotes and literal '\n'.
print(pd.DataFrame(ci_rows).to_string(index=False))
print()
print('CIs overlap - n=25 keeps CIs wide; direction is the claimed signal. Direction (Experienced nearest 1.0) aligns with Haney & Mercer (2011).')
# ---- Cohen's d - Experienced vs Believers (split ratio) ----
exp_sr = gps.loc[gps['archetype']=='The Experienced', 'split_ratio'].dropna().values
bel_sr = gps.loc[gps['archetype']=='The Believers',   'split_ratio'].dropna().values
if len(exp_sr) < 2 or len(bel_sr) < 2:
    # A sample variance needs at least two observations in each group.
    pooled, d_m1, tag = np.nan, np.nan, 'n/a'
    print("Cohen's d (Believers vs Experienced, split ratio): n/a (need at least 2 runners per group)")
else:
    # Pooled SD weights each group's *sample* variance (ddof=1) by its degrees of
    # freedom.  NumPy's default ddof=0 would understate the spread and inflate d.
    pooled = np.sqrt(((len(exp_sr)-1)*exp_sr.var(ddof=1) + (len(bel_sr)-1)*bel_sr.var(ddof=1))
                     / (len(exp_sr)+len(bel_sr)-2))
    d_m1 = (bel_sr.mean() - exp_sr.mean()) / pooled
    tag   = 'small' if abs(d_m1)<0.5 else 'medium' if abs(d_m1)<0.8 else 'large'
    print(f"Cohen's d (Believers vs Experienced, split ratio): {d_m1:.2f}  ({tag} effect)")
print("At n=25 the CI widths are still wide; the direction is the claimed signal.")

Bootstrap 95% CI — split ratio by archetype (10 000 resamples):

'      Archetype  n  Mean split         95% CI\nThe Experienced  7       0.974 [0.959, 0.988]\n   The Grinders 11       1.021 [0.986, 1.079]\n  The Believers  7       1.020 [0.971, 1.095]'

CIs overlap — n=25 keeps CIs wide; direction is the claimed signal. Direction (Experienced nearest 1.0) aligns with Haney & Mercer (2011).
Cohen's d (Believers vs Experienced, split ratio): 0.71  (medium effect)
At n=25 the CI widths are still wide; the direction is the claimed signal.

def mean_in_range(name, col, kmlo, kmhi):
    """Mean of one splits column for a runner over the km window [kmlo, kmhi].

    `name` is looked up in SURVEY_TO_GPX to find the runner's key in `gpx`;
    `col` is a column of that runner's `gpx[...]['splits']` table. Both window
    bounds are inclusive. Returns np.nan when the runner has no usable splits
    (name not mapped, key missing from `gpx`, or a None/empty splits table).
    """
    g = SURVEY_TO_GPX.get(name)
    if g is None or g not in gpx: return np.nan
    sp = gpx[g]['splits']
    # Same None/empty guard the later Myth 2 cells apply before touching `splits`.
    if sp is None or sp.empty: return np.nan
    return sp.loc[sp['km'].between(kmlo, kmhi), col].mean()

# Per-runner climb metrics from the splits tables (all km bounds inclusive):
# HR on the flat = km 3-10, HR on the climb = km 13-16,
# baseline pace = km 1-12, climb pace = km 13-16.
gps['hr_flat']   = gps['name'].apply(lambda n: mean_in_range(n, 'hr',   3, 10))
gps['hr_climb']  = gps['name'].apply(lambda n: mean_in_range(n, 'hr',  13, 16))
gps['pace_base'] = gps['name'].apply(lambda n: mean_in_range(n, 'pace', 1, 12))
gps['pace_clb']  = gps['name'].apply(lambda n: mean_in_range(n, 'pace',13, 16))
# hr_spike > 0: higher mean HR on the climb; pace_drop > 0: higher pace value on the climb.
gps['hr_spike']  = gps['hr_climb'] - gps['hr_flat']
gps['pace_drop'] = gps['pace_clb'] - gps['pace_base']

summary = gps.groupby('trained_hills')[['hr_spike','pace_drop']].mean().round(2)
# Relabel by flag value instead of assigning a fixed two-item list: the list form raises
# when only one group is present and depends on the groups' sort order.
# rename_axis(None) keeps the printed table free of the 'trained_hills' index header.
summary = summary.rename(index={0: 'Not hill-trained', 1: 'Hill-trained'}).rename_axis(None)
print(summary)

fig, axes = plt.subplots(1, 2, figsize=(13, 4.5))
ax_hr, ax_pace = axes   # left panel: heart-rate bars; right panel: per-km pace lines

# (a) grouped bar: mean HR at flat vs climb, by group
groups = ['Hill-trained','Not hill-trained']
# Flags are listed in the same order as `groups` (1 = hill-trained, 0 = not).
flat   = [gps.loc[gps.trained_hills==flag, 'hr_flat'].mean()  for flag in (1, 0)]
climb  = [gps.loc[gps.trained_hills==flag, 'hr_climb'].mean() for flag in (1, 0)]
x = np.arange(2); w = 0.36
bar_style = dict(alpha=.85, edgecolor='#333')
ax_hr.bar(x-w/2, flat,  w, label='Flat (km 3–10)',   color=GRI, **bar_style)
ax_hr.bar(x+w/2, climb, w, label='Climb (km 13–16)', color=ACC, **bar_style)
# Value label just above every bar; a bar whose mean is NaN gets no label.
for shift, heights in ((-w/2, flat), (w/2, climb)):
    for xi, h in zip(x, heights):
        if np.isnan(h):
            continue
        ax_hr.text(xi+shift, h+0.6, f'{h:.0f}', ha='center', color=WHITE, fontsize=10, fontweight='bold')
ax_hr.set_xticks(x)
ax_hr.set_xticklabels(groups)
ax_hr.set_ylabel('Mean HR (bpm)')
ax_hr.set_title('Cardiac cost: flat vs climb', fontweight='bold')
ax_hr.legend()
ax_hr.grid(alpha=.3, axis='y')

# (b) per-km mean pace for the two groups across the climb window
kms = list(range(8, 19))
for label, mask, color in [('Hill-trained', gps.trained_hills==1, GRI),
                           ('Not hill-trained', gps.trained_hills==0, ACC)]:
    # Splits tables of the group's runners that have a parsed GPX track.
    group_splits = [gpx[SURVEY_TO_GPX[n]]['splits']
                    for n in gps.loc[mask,'name'] if SURVEY_TO_GPX.get(n) in gpx]
    paces = []
    for k in kms:
        # One mean per runner for this km, then the NaN-ignoring mean across runners.
        vals = [sp.loc[sp['km']==k, 'pace'].mean() for sp in group_splits]
        paces.append(np.nanmean(vals) if vals else np.nan)
    ax_pace.plot(kms, paces, marker='o', lw=2.2, color=color, label=label)
# Shade the km 13-16 window on the pace panel.
ax_pace.axvspan(13, 16, color=ACC, alpha=.12)
ax_pace.set_xlabel('Distance (km)')
ax_pace.set_ylabel('Mean pace (min/km)')
ax_pace.set_title('Pace across the climb', fontweight='bold')
ax_pace.legend()
ax_pace.grid(alpha=.3)

plt.suptitle('Myth 2: Hill training and the climb', fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

                  hr_spike  pace_drop
Not hill-trained      8.04       0.07
Hill-trained          6.35       0.34

# ---- Myth 2 - Interactive Plotly: HR and pace profile at the climb ----
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Survey names whose SURVEY_TO_GPX entry is a key of `gpx`, in df_clean order.
GPS_RUNNERS = list(filter(lambda n: SURVEY_TO_GPX.get(n) in gpx, df_clean['name']))

def avg_series(runner_list, col, km_lo=0, km_hi=21):
    """Average one splits column across runners at every integer km.

    For each integer km in [km_lo, km_hi], a runner's value is the mean of
    `col` over the splits rows whose 'km' lies within 0.55 of that km (NaN when
    no row qualifies). Runners that are not in `gpx`, or whose splits table is
    None or empty, are skipped. Returns `(int_kms, means)`, where `means` is
    the NaN-ignoring average over the remaining runners, or a list of NaN when
    no runner contributed.
    """
    int_kms = list(range(km_lo, km_hi + 1))

    def _per_km_means(splits):
        # One value per integer km for a single runner.
        means = []
        for km in int_kms:
            window = splits.loc[(splits['km'] - km).abs() < 0.55, col]
            means.append(window.mean() if len(window) else np.nan)
        return means

    per_runner = []
    for name in runner_list:
        folder = SURVEY_TO_GPX.get(name)
        if folder in gpx:
            splits = gpx[folder]['splits']
            if splits is not None and not splits.empty:
                per_runner.append(_per_km_means(splits))

    if per_runner:
        return int_kms, np.nanmean(per_runner, axis=0).tolist()
    return int_kms, [np.nan] * len(int_kms)

# Split the GPS runners by the survey's trained_hills flag (1 -> hill-trained, 0 -> not).
# A runner whose flag is neither 1 nor 0 lands in neither list.
ht_runners, nht_runners = [], []
for n in GPS_RUNNERS:
    flag = df_clean.loc[df_clean['name']==n, 'trained_hills'].values[0]
    if flag == 1:
        ht_runners.append(n)
    elif flag == 0:
        nht_runners.append(n)

# Group-average pace and heart rate per integer km (avg_series defaults: km 0-21).
kms, ht_pace = avg_series(ht_runners, 'pace')
nht_pace = avg_series(nht_runners, 'pace')[1]
ht_hr    = avg_series(ht_runners,  'hr')[1]
nht_hr   = avg_series(nht_runners, 'hr')[1]

fig_m2i = make_subplots(specs=[[{'secondary_y': True}]])

# (y values, legend name, colour, dash style or None for solid, hover text, on secondary axis?)
trace_specs = [
    (ht_pace,  'Pace – Hill-trained', '#00C8FF', None,
     'km %{x}: %{y:.2f} min/km<extra>Pace hill-trained</extra>', False),
    (nht_pace, 'Pace – Not trained',  '#00C8FF', 'dot',
     'km %{x}: %{y:.2f} min/km<extra>Pace not-trained</extra>', False),
    (ht_hr,    'HR – Hill-trained',   '#FF3366', None,
     'km %{x}: %{y:.0f} bpm<extra>HR hill-trained</extra>', True),
    (nht_hr,   'HR – Not trained',    '#FF3366', 'dot',
     'km %{x}: %{y:.0f} bpm<extra>HR not-trained</extra>', True),
]
for ys, trace_name, colour, dash, hover, on_secondary in trace_specs:
    line_style = dict(color=colour, width=2)
    if dash is not None:
        line_style['dash'] = dash
    fig_m2i.add_trace(
        go.Scatter(x=kms, y=ys, mode='lines+markers', name=trace_name,
                   line=line_style, marker=dict(size=5), hovertemplate=hover),
        secondary_y=on_secondary)

# Climb overlay
fig_m2i.add_vrect(x0=13, x1=16, fillcolor='rgba(255,200,0,0.12)',
                  line_color='rgba(255,200,0,0.5)', line_dash='dash',
                  annotation_text='Killer Climb ⛰', annotation_position='bottom right')

fig_m2i.update_layout(
    template='plotly_dark', height=440,
    title='Myth 2: HR and Pace at the Killer Climb<br>'
          '<sub>Solid = hill-trained; dotted = not trained. Km 13–16 = climb zone.</sub>',
    xaxis_title='km', legend=dict(x=0.01, y=0.99),
)
# Pace on the primary (left) y-axis, heart rate on the secondary one.
for axis_title, on_secondary in (('Pace (min/km)', False), ('Heart Rate (bpm)', True)):
    fig_m2i.update_yaxes(title_text=axis_title, secondary_y=on_secondary)
fig_m2i.show()

# ---- Myth 2 - Interactive Plotly: Pace Drop at the Climb (per runner) ----
import plotly.graph_objects as go

# For each GPS runner compute baseline pace (km 1-12) and climb pace (km 13-16)
pace_drops = []
for name in GPS_RUNNERS:
    g = SURVEY_TO_GPX.get(name)
    if g not in gpx: continue
    sp = gpx[g]['splits']
    if sp is None or sp.empty: continue
    base  = sp.loc[sp['km'].between(1, 12),  'pace'].mean()
    climb = sp.loc[sp['km'].between(13, 16), 'pace'].mean()
    # A runner missing either window cannot get a meaningful difference.
    if np.isnan(base) or np.isnan(climb): continue
    hill  = int(df_clean.loc[df_clean['name']==name, 'trained_hills'].values[0])
    pace_drops.append({'name': name, 'drop': climb - base, 'hill': hill})

# Ascending by drop, so the largest slow-down is the last bar.
pace_drops.sort(key=lambda x: x['drop'])

# Label bars by first name, but fall back to the full name when a first name is shared:
# identical y labels are one category in Plotly, so those runners' bars would be drawn
# on the same row.
first_names = [d['name'].split()[0] for d in pace_drops]
names  = [d['name'] if first_names.count(fn) > 1 else fn
          for d, fn in zip(pace_drops, first_names)]
drops  = [d['drop'] for d in pace_drops]
colors = ['#00C8FF' if d['hill'] else '#FF6B8A' for d in pace_drops]

fig_m2r = go.Figure(go.Bar(
    x=drops, y=names, orientation='h',
    marker_color=colors,
    text=[f"{v:+.2f}" for v in drops],   # explicit sign on every label
    textposition='outside',
    hovertemplate='%{y}: %{x:+.3f} min/km vs baseline<extra></extra>'
))

# Zero reference line: bars to the right of it slowed down on the climb.
fig_m2r.add_vline(x=0, line_color='#E8FF00', line_dash='dash', line_width=1.5)

fig_m2r.update_layout(
    title='Pace Drop at the Climb vs Flat Baseline (km 1–12)',
    xaxis_title='Pace change (min/km), positive = slowed down',
    yaxis_title='Runner',
    plot_bgcolor='#1a1a1a', paper_bgcolor='#111',
    font_color='#ccc',
    xaxis=dict(gridcolor='#333', zerolinecolor='#555'),
    yaxis=dict(gridcolor='#333'),
    # Bars are coloured per runner inside a single trace, so the colour key is an annotation.
    annotations=[dict(x=0.98, y=1.04, xref='paper', yref='paper',
                      text='<span style="color:#00C8FF">■ Hill-trained</span>  '
                           '<span style="color:#FF6B8A">■ Not hill-trained</span>',
                      showarrow=False, font_size=12, align='right')],
    height=420, margin=dict(l=80, r=60, t=60, b=50)
)
fig_m2r.show()

# ---- Bootstrap 95% CI - HR spike, hill-trained vs not (10 000 resamples) ----
# Percentile bootstrap of each group's mean HR spike; the fixed seed keeps the output reproducible.
rng_bs2 = np.random.default_rng(43)
ht_vals  = gps.loc[gps['trained_hills']==1, 'hr_spike'].dropna().values
nht_vals = gps.loc[gps['trained_hills']==0, 'hr_spike'].dropna().values
for group, vals in [('Hill-trained', ht_vals), ('Not hill-trained', nht_vals)]:
    if len(vals) == 0: continue
    bs   = rng_bs2.choice(vals, size=(10_000, len(vals)), replace=True).mean(axis=1)
    lo, hi = np.percentile(bs, [2.5, 97.5])
    # Group size is taken from the data actually used (after dropna), not hard-coded.
    label = f'{group} (n={len(vals)})'
    print(f'{label:28s} mean HR spike = {vals.mean():.1f} bpm   95% CI [{lo:.1f}, {hi:.1f}]')
print()
print(f'Wide CIs, especially for the hill-trained group (n={len(ht_vals)}). Direction supports Billat et al. (2003).')
# ---- Cohen's d - HR spike, hill-trained vs not ----
# A sample variance needs at least two observations per group.
if len(ht_vals) > 1 and len(nht_vals) > 1:
    # ddof=1: the pooled-SD formula weights *sample* variances by (n-1). numpy's default
    # ddof=0 understates each variance and therefore inflates |d|.
    pool2 = np.sqrt(((len(ht_vals)-1)*ht_vals.var(ddof=1) + (len(nht_vals)-1)*nht_vals.var(ddof=1))
                    / (len(ht_vals)+len(nht_vals)-2))
    # Negative d: the hill-trained group's mean HR spike is the smaller one.
    d_m2 = (ht_vals.mean() - nht_vals.mean()) / pool2
    tag2  = 'small' if abs(d_m2)<0.5 else 'medium' if abs(d_m2)<0.8 else 'large'
    print(f"Cohen's d (hill-trained vs not, HR spike): {d_m2:.2f}  ({tag2} effect)")
    print(f"Hill-trained group n={len(ht_vals)} - treat this as exploratory, not confirmatory.")

Hill-trained (n=4)           mean HR spike = 6.4 bpm   95% CI [3.7, 8.9]
Not hill-trained (n=17)      mean HR spike = 8.0 bpm   95% CI [5.8, 10.6]

Wide CIs, especially for the hill-trained group (n=4). Direction supports Billat et al. (2003).
Cohen's d (hill-trained vs not, HR spike): -0.32  (small effect)
Hill-trained group n=4 — treat this as exploratory, not confirmatory.

# Runners with both a weekly volume and a finish time.
vol = df_clean.dropna(subset=['km_per_week','finish_min']).copy()

fig, axes = plt.subplots(1, 2, figsize=(13, 4.5))


def _archetype_scatter(ax, xform=None):
    """Scatter finish time against weekly km (optionally transformed), one colour per archetype."""
    for arch in ARCH_ORDER:
        sub = vol[vol['archetype']==arch]
        xvals = sub['km_per_week'] if xform is None else xform(sub['km_per_week'])
        ax.scatter(xvals, sub['finish_min'], color=ARCH_COLORS[arch], s=85,
                   alpha=.85, edgecolor='#333', lw=.4, label=arch, zorder=3)


# (a) linear scale (mirrors the website chart)
_archetype_scatter(axes[0])
# Least-squares line over all runners, drawn across the observed x range.
m, b = np.polyfit(vol['km_per_week'], vol['finish_min'], 1)
xs   = np.linspace(vol['km_per_week'].min(), vol['km_per_week'].max(), 100)
axes[0].plot(xs, m*xs+b, color='#888', lw=1.5, ls='--')
r_lin = np.corrcoef(vol['km_per_week'], vol['finish_min'])[0,1]
axes[0].set_title(f'Linear scale  (r = {r_lin:.2f})', fontweight='bold')
axes[0].set_xlabel('km / week')
axes[0].set_ylabel('Finish (min)')
axes[0].legend(fontsize=8)
axes[0].grid(alpha=.3)

# (b) semi-log x (Week 4 check)
logx = np.log10(vol['km_per_week'])
_archetype_scatter(axes[1], xform=np.log10)
m2, b2 = np.polyfit(logx, vol['finish_min'], 1)
xs2 = np.linspace(logx.min(), logx.max(), 100)
axes[1].plot(xs2, m2*xs2+b2, color='#888', lw=1.5, ls='--')
r_log = np.corrcoef(logx, vol['finish_min'])[0,1]
axes[1].set_title(f'Semi-log x  (r = {r_log:.2f})', fontweight='bold')
axes[1].set_xlabel('log₁₀(km / week)')
axes[1].set_ylabel('Finish (min)')
axes[1].grid(alpha=.3)

plt.suptitle('Myth 3: Training volume vs finish time', fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# ---- Myth 3 - Interactive Plotly scatter: volume vs finish time (Week 6) ----
# Runners with both a weekly volume and a finish time; r is the pooled Pearson correlation.
m3_df  = df_clean.dropna(subset=['km_per_week', 'finish_min']).copy()
_r_m3  = np.corrcoef(m3_df['km_per_week'], m3_df['finish_min'])[0, 1]

fig_m3 = px.scatter(
    m3_df, x='km_per_week', y='finish_min',
    color='archetype', color_discrete_map=ARCH_COLORS,
    hover_name='name',
    hover_data={'km_per_week': ':.0f', 'finish_min': ':.1f',
                'training_weeks': True, 'archetype': False},
    # With color= set, px fits one trendline per colour group by default. The title reports
    # a single pooled r, so fit a single pooled line to match it.
    trendline='ols', trendline_scope='overall',
    labels={'km_per_week': 'km / week', 'finish_min': 'Finish time (min)',
            'archetype': 'Archetype', 'training_weeks': 'Weeks trained'},
    title=f'Myth 3: Training volume vs finish time  (Pearson r = {_r_m3:.2f})',
    template='plotly_dark', height=420,
)
# Style only the point traces; the trendline trace is drawn in 'lines' mode.
fig_m3.update_traces(marker=dict(size=11, line=dict(width=1, color='#333')),
                     selector=dict(mode='markers'))
# Standalone HTML copy written to OUT_DIR; plotly.js is loaded from the CDN.
fig_m3.write_html(OUT_DIR / 'fig_myth3_interactive.html', include_plotlyjs='cdn')
fig_m3.show()

def split_label(r):
    """Name the pacing strategy for a split ratio.

    Missing values map to 'Unknown'; ratios below 0.97 to 'Negative'; ratios
    from 0.97 up to (but not including) 1.03 to 'Even'; everything else to
    'Positive'.
    """
    if pd.isna(r):
        return 'Unknown'
    # Exclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((0.97, 'Negative'), (1.03, 'Even')):
        if r < upper_bound:
            return label
    return 'Positive'
# Attach the pacing-strategy label to both the full survey frame and the GPS frame.
df_clean['split_strategy'] = df_clean['split_ratio'].apply(split_label)
gps['split_strategy']      = gps['split_ratio'].apply(split_label)

int_yes = df_clean[df_clean['trains_intervals']==1]
int_no  = df_clean[df_clean['trains_intervals']==0]

fig, axes = plt.subplots(1, 2, figsize=(13, 4.5))

# (a) overlapping semi-transparent histograms (DAOST Ch.2 - preferred for two continuous distributions)
# Edges run 70..140 in 5-min steps, the same range as the interactive version of this chart.
# The previous np.arange(70, 135, 5) stopped at 130 and silently dropped every finish time
# above 130 min from the bars while the legend still reported the full group size.
bins = np.arange(70, 145, 5)
axes[0].hist(int_yes['finish_min'], bins=bins, color=EXP, alpha=.6,
             edgecolor='#333', label=f'Intervals (n={len(int_yes)})')
axes[0].hist(int_no['finish_min'],  bins=bins, color=BEL, alpha=.6,
             edgecolor='#333', label=f'No intervals (n={len(int_no)})')
# Dashed verticals mark each group's mean finish time.
axes[0].axvline(int_yes['finish_min'].mean(), color=EXP, ls='--', lw=1.5)
axes[0].axvline(int_no['finish_min'].mean(),  color=BEL, ls='--', lw=1.5)
axes[0].set_xlabel('Finish time (min)'); axes[0].set_ylabel('Runners')
axes[0].set_title('Finish-time distribution by interval training', fontweight='bold')
axes[0].legend(); axes[0].grid(alpha=.3, axis='y')

# (b) split-strategy grouped bar - only meaningful for the GPS subset
# 'Unknown' labels are not in strat_order and are therefore not counted.
strat_order = ['Negative','Even','Positive']
yes_counts = [(gps[gps.trains_intervals==1]['split_strategy']==s).sum() for s in strat_order]
no_counts  = [(gps[gps.trains_intervals==0]['split_strategy']==s).sum() for s in strat_order]
x  = np.arange(len(strat_order)); w = 0.4
axes[1].bar(x-w/2, yes_counts, w, color=EXP, alpha=.85, edgecolor='#333', label='Intervals')
axes[1].bar(x+w/2, no_counts,  w, color=BEL, alpha=.85, edgecolor='#333', label='No intervals')
axes[1].set_xticks(x); axes[1].set_xticklabels([f'{s} split' for s in strat_order])
axes[1].set_ylabel('GPS runners'); axes[1].legend(); axes[1].grid(alpha=.3, axis='y')
axes[1].set_title('Split strategy (GPS sub-sample only)', fontweight='bold')

plt.suptitle('Myth 4: Interval training and race strategy', fontweight='bold', y=1.02)
plt.tight_layout(); plt.show()

print(f'Mean finish - intervals:    {int_yes["finish_min"].mean():.1f} min')
print(f'Mean finish - no intervals: {int_no["finish_min"].mean():.1f} min')
print(f'Difference                : {int_no["finish_min"].mean() - int_yes["finish_min"].mean():.1f} min')

Mean finish — intervals:    100.4 min
Mean finish — no intervals: 118.6 min
Difference                : 18.2 min

# ---- Myth 4 - Interactive Plotly: finish distribution + split strategy ----
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Runners with an interval flag, a finish time and a split ratio, then split by the flag
# (1 -> int_yes, 0 -> int_no).
m4_full = df_clean.dropna(subset=['trains_intervals','finish_min','split_ratio']).copy()
int_yes, int_no = (m4_full[m4_full['trains_intervals']==flag] for flag in (1, 0))

def classify(r):
    """Label a split ratio: 'Negative' below 0.97, 'Even' below 1.03, else 'Positive'.

    Missing values are labelled 'Unknown'.
    """
    if pd.isna(r):
        return 'Unknown'
    if r < 0.97:
        return 'Negative'
    if r < 1.03:
        return 'Even'
    return 'Positive'

strats = ['Negative','Even','Positive']
strat_cols = {'Negative':'#00C8FF','Even':'#AAFF00','Positive':'#FF3366'}

fig_m4i = make_subplots(rows=1, cols=2,
    subplot_titles=('Finish Time: Intervals vs No Intervals','Split Strategy Distribution'))

# Left panel: overlaid finish-time histograms on shared 5-min bins (70-140 min).
# Hover text: "%{x}+5" is not evaluated by Plotly (the "+5" is printed literally),
# so the template shows the bin label on its own.
hist_bins = dict(start=70, end=140, size=5)
for grp, label, fill in [(int_yes, 'Intervals',    'rgba(0,200,255,0.53)'),
                         (int_no,  'No intervals', 'rgba(170,255,0,0.53)')]:
    fig_m4i.add_trace(go.Histogram(
        x=grp['finish_min'], name=label, xbins=hist_bins, marker_color=fill,
        hovertemplate='%{x} min: %{y}<extra>' + label + '</extra>'),
        row=1, col=1)
# Vertical mean lines
fig_m4i.add_vline(x=int_yes['finish_min'].mean(), line_dash='dash', line_color='#00C8FF',
                  annotation_text=f'{int_yes["finish_min"].mean():.0f} min', row=1, col=1)
fig_m4i.add_vline(x=int_no['finish_min'].mean(),  line_dash='dash', line_color='#AAFF00',
                  annotation_text=f'{int_no["finish_min"].mean():.0f} min',  row=1, col=1)

# Right panel: runners per split strategy. barmode='overlay' (wanted for the histograms) is
# figure-wide, so without explicit positions the two bar traces would be drawn on top of
# each other. Explicit offset/width place them side by side around each category.
bar_w = 0.4
for i, (label, grp, colA) in enumerate([('Intervals',    int_yes, 'rgba(0,200,255,0.6)'),
                                        ('No intervals', int_no,  'rgba(170,255,0,0.6)')]):
    counts = [sum(classify(r)==s for r in grp['split_ratio']) for s in strats]
    fig_m4i.add_trace(go.Bar(
        x=strats, y=counts, name=label, marker_color=colA,
        width=bar_w, offset=(i - 1) * bar_w,   # first trace left of centre, second right
        hovertemplate='%{x}: %{y} runners<extra>' + label + '</extra>',
        showlegend=False),   # colours match the histogram legend entries
        row=1, col=2)

fig_m4i.update_layout(
    template='plotly_dark', height=400, barmode='overlay',
    title='Myth 4: Interval Training and Race Strategy',
)
fig_m4i.update_xaxes(title_text='Finish time (min)', row=1, col=1)
fig_m4i.update_xaxes(title_text='Split strategy', row=1, col=2)
fig_m4i.update_yaxes(title_text='Runners', row=1, col=1)
fig_m4i.update_yaxes(title_text='Runners', row=1, col=2)
fig_m4i.show()

# ---- Myth 4 - Confound visualisation: intervals vs experience + volume ----
# Scatter of weekly volume (x) against finish time (y); marker shape/colour encodes the
# interval flag and marker size grows with experience_level.
conf_df = df_clean.dropna(subset=['km_per_week', 'experience_level', 'finish_min']).copy()

fig_m4 = go.Figure()
for flag, label, col, sym in [(1, 'Intervals: Yes', '#00C8FF', 'circle'),
                               (0, 'Intervals: No',  '#AAFF00', 'diamond')]:
    sub = conf_df[conf_df['trains_intervals'] == flag]
    # Readable experience label for the hover text; levels outside 0-2 show as '?'.
    exp_label = sub['experience_level'].map({0: 'First HM', 1: '1–3 races', 2: '4+ races'}).fillna('?')
    fig_m4.add_trace(go.Scatter(
        x=sub['km_per_week'], y=sub['finish_min'],
        mode='markers', name=label,
        marker=dict(size=10 + sub['experience_level'].fillna(0) * 6,
                    color=col, symbol=sym, line=dict(width=1, color='#333')),
        text=(sub['name'] + '<br>km/week: ' + sub['km_per_week'].astype(str) +
              '<br>Experience: ' + exp_label +
              '<br>Finish: ' + sub['finish_min'].round(1).astype(str) + ' min'),
        hovertemplate='%{text}<extra></extra>',
    ))
fig_m4.update_layout(
    template='plotly_dark', height=440,
    # Subtitle fix: with x = km/week and y = finish time, "fast, high-volume" is the
    # bottom-right of the plot (low y, high x), not the top-left.
    title='Myth 4 Confound: interval trainers are also more experienced AND run more km/week<br>'
          '<sub>Marker size = experience level (bigger = more prior races). '
          'The interval group clusters bottom-right: fast, high-volume, experienced.</sub>',
    xaxis_title='km / week', yaxis_title='Finish time (min)',
    legend_title='Interval training',
    annotations=[dict(
        text="Simpson's Paradox risk: experience + volume explain much of the gap attributed to intervals.",
        xref='paper', yref='paper', x=0.01, y=0.02, showarrow=False,
        font=dict(size=10, color='#888'), align='left',
    )],
)
# Standalone HTML copy written to OUT_DIR; plotly.js is loaded from the CDN.
fig_m4.write_html(OUT_DIR / 'fig_myth4_confound.html', include_plotlyjs='cdn')
fig_m4.show()

# ---- Myth 4 - Partial correlation: intervals vs finish controlling for volume + experience ----
from scipy import stats as _stats

# Rows usable for the partial correlation: all four analysis columns must be present.
m4 = df_clean[df_clean[['trains_intervals','finish_min','km_per_week','experience_level']]
              .notna().all(axis=1)].copy()

def _resid(X_cols, y_vals, df):
    """Return the least-squares residuals of `y_vals` regressed on an intercept plus `df[X_cols]`."""
    # Design matrix: a leading column of ones (intercept), then one column per predictor.
    design = np.ones((len(df), len(X_cols) + 1))
    for j, column in enumerate(X_cols, start=1):
        design[:, j] = df[column].values
    beta = np.linalg.lstsq(design, y_vals, rcond=None)[0]
    return y_vals - design @ beta

controls = ['km_per_week', 'experience_level']
r_raw,     p_raw     = _stats.pearsonr(m4['trains_intervals'],            m4['finish_min'])
# Partial r = correlation between the two variables after each has been residualised
# on the controls.
r_partial, _ = _stats.pearsonr(
    _resid(controls, m4['trains_intervals'].values.astype(float), m4),
    _resid(controls, m4['finish_min'].values,                     m4),
)
# pearsonr's own p-value assumes n-2 degrees of freedom. Residualising on k controls
# costs k more, so test r_partial with a t statistic on n-2-k degrees of freedom;
# otherwise the p-value is too small.
dof_partial = len(m4) - 2 - len(controls)
if dof_partial > 0 and abs(r_partial) < 1:
    t_partial = r_partial * np.sqrt(dof_partial / (1 - r_partial**2))
    p_partial = float(2 * _stats.t.sf(abs(t_partial), dof_partial))
else:
    # Too few rows (or a degenerate r of exactly +/-1) for the test to be defined.
    p_partial = np.nan

# Relative reduction in |r| once the controls are partialled out.
pct_drop = (1 - abs(r_partial) / abs(r_raw)) * 100 if r_raw != 0 else 0
print(f"Raw Pearson r (intervals vs finish_min)                              : r = {r_raw:.3f},  p = {p_raw:.3f}")
print(f"Partial r    (intervals vs finish_min | km_per_week, experience_lvl) : r = {r_partial:.3f},  p = {p_partial:.3f}")
print(f"Effect shrinks by {pct_drop:.0f}% when controlling for the two main confounds.")
print()
if p_partial < 0.05:
    print("Partial r remains significant - intervals carry signal beyond volume + experience.")
else:
    print("Partial r is not significant - we cannot rule out that the raw gap is fully explained by confounds.")

Raw Pearson r (intervals vs finish_min)                              : r = -0.585,  p = 0.002
Partial r    (intervals vs finish_min | km_per_week, experience_lvl) : r = -0.436,  p = 0.029
Effect shrinks by 25% when controlling for the two main confounds.

Partial r remains significant — intervals carry signal beyond volume + experience.

# ---- Copenhagen HM comparison - interactive histogram ----
import plotly.graph_objects as go

# Representative Copenhagen HM data (embedded from website D object)
CPH_MEAN, CPH_STD, CPH_MED = 115.4, 23.8, 115.3
CPH_P25, CPH_P75 = 97.5, 130.9
# (bin start in minutes, runner count); each bin is BIN_W minutes wide.
CPH_HIST = [
    (70,21),(75,28),(80,42),(85,58),(90,71),(95,84),(100,91),(105,89),
    (110,76),(115,64),(120,52),(125,38),(130,28),(135,19),(140,14),(145,9),(150,7),
]
BIN_W = 5
# Our histogram is computed from df_clean on the same bins instead of being typed in by
# hand, so it cannot drift from the data or from the median line drawn below.
# Finish times outside 70-155 min fall outside the edges and are not counted.
bin_starts = [b for b, _ in CPH_HIST]
our_counts = np.histogram(df_clean['finish_min'].dropna(),
                          bins=bin_starts + [bin_starts[-1] + BIN_W])[0]
our_hist = [(b, int(c)) for b, c in zip(bin_starts, our_counts)]

# Scale our group to CPH total for visual comparison
cph_tot = sum(c for _,c in CPH_HIST)
our_tot = sum(c for _,c in our_hist)
scale = cph_tot / our_tot if our_tot else 0.0   # no binned runners -> draw empty bars

fig_cph = go.Figure()
# offset=0 + width=BIN_W draw each bar over [bin start, bin start + BIN_W]. By default a bar
# is centred on its x value, which would shift every bin half a bin left of the median lines.
# customdata carries the bin's upper edge (and, for our group, the unscaled count) for the
# hover text; writing "%{x}+5" in a hovertemplate just prints a literal "+5".
fig_cph.add_trace(go.Bar(
    x=bin_starts, y=[c for _,c in CPH_HIST],
    offset=0, width=BIN_W,
    customdata=[b + BIN_W for b in bin_starts],
    name='Copenhagen HM', marker_color='rgba(85,85,85,0.67)',
    hovertemplate='%{x}–%{customdata} min: %{y} runners<extra>CPH</extra>',
))
fig_cph.add_trace(go.Bar(
    x=bin_starts, y=[c*scale for _,c in our_hist],
    offset=0, width=BIN_W,
    customdata=[[b + BIN_W, c] for b, c in our_hist],
    name='Our Group (scaled)', marker_color='rgba(232,255,0,0.8)',
    hovertemplate='%{x}–%{customdata[0]} min: %{customdata[1]} runners (bar scaled)<extra>Our group</extra>',
))
# Median lines
fig_cph.add_vline(x=CPH_MED, line_dash='dash', line_color='#888', annotation_text=f'CPH median {CPH_MED:.0f} min')
our_med = float(df_clean['finish_min'].median())
fig_cph.add_vline(x=our_med, line_dash='dash', line_color='#E8FF00', annotation_text=f'Our median {our_med:.0f} min')

fig_cph.update_layout(
    template='plotly_dark', barmode='overlay', height=400, bargap=0.05,
    # Age range corrected to 22–28: the notebook's describe() table reports age min 22, max 28.
    title='Copenhagen HM vs Our Group: Finish Time Distribution<br>'
          '<sub>Our group scaled to CPH total. Both medians marked. '
          'Note: CPH is open-age; our cohort is 22–28 years old.</sub>',
    xaxis_title='Finish time (min)', yaxis_title='Runners',
    legend=dict(x=0.75, y=0.95),
)
fig_cph.show()
print(f"Our median: {our_med:.1f} min  |  CPH median: {CPH_MED:.0f} min")
print(f"We are {CPH_MED - our_med:.1f} min faster than the CPH median.")
print(f"CPH IQR: {CPH_P25}–{CPH_P75} min  |  Our range: {df_clean['finish_min'].min():.0f}–{df_clean['finish_min'].max():.0f} min")

Our median: 108.4 min  |  CPH median: 115 min
We are 6.9 min faster than the CPH median.
CPH IQR: 97.5–130.9 min  |  Our range: 78–135 min

#	Myth	Source we benchmark against
1	Experienced runners pace themselves better	Haney & Mercer (2011)
2	Hill training helps on a hilly course	Billat et al. (2003)
3	More training volume always means faster times	Sato et al. (2015)
4	Interval training leads to smarter race strategy	Helgerud et al. (2007)

Source	Records	Format	What's in it
Pre-race survey (Google Forms)	22 runners	`.xlsx`	name, age, weight, weeks trained, km/week, training type, prior half-marathon PR, target time, confidence (1–5), injuries, main worry, pacing strategy, finish time
Race-day GPS tracks	25 runners	`.gpx`	trackpoints every ~1 s: latitude, longitude, elevation, time, heart rate (Garmin/COROS extension)

	age	km_per_week	training_weeks	target_min	finish_min	split_ratio
count	25.00	25.00	25.00	20.00	25.00	25.00
mean	23.52	23.76	9.44	106.90	106.92	1.01
std	1.50	13.05	4.05	13.42	15.24	0.08
min	22.00	7.00	1.00	80.00	78.22	0.95
25%	23.00	15.00	8.00	103.75	98.57	0.97
50%	23.00	25.00	11.00	106.00	108.40	1.00
75%	24.00	25.00	11.00	113.00	115.43	1.00
max	28.00	52.00	18.00	135.00	135.27	1.29

Category	Tool we used	Where on the site	Why we picked it
Visual Structuring	Consistent visual platform	Every section: same palette (cyan / pink / lime / yellow), same Bebas Neue display font, same dark background	Reduces cognitive switching cost; a reader knows immediately which archetype a chart refers to without having to re-read a legend
Visual Structuring	Establishing shot / splash screen	Hero panel with the giant TESTING RUNNING MYTHS title and the four headline numbers (25 / 21.1 / avg / best)	Sets up the question and the scale in one screen; the reader knows what they're looking at before they scroll
Visual Structuring	Progress bar / "you are here"	Sticky top nav with section anchors (Runners · Predictions · Course · Myth 1–4 · Findings · vs CPH)	Long pages confuse readers; the nav doubles as a table of contents and a progress indicator
Highlighting	Feature distinction (colour)	Three archetype colours reused on every chart, the climb shaded yellow on every elevation/pace chart	Lets the reader compare across charts without re-orienting
Highlighting	Close-ups	Runner modal popup (click a name → full profile card)	Surfaces detail on demand without polluting the overview
Highlighting	Annotation	Verdict badges (Supported / Partially Supported), per-myth Finding boxes, the Killer Climb label on the elevation chart	Author commentary, in plain language, anchored to the visual
Transition Guidance	Familiar objects	A real Leaflet map of Lyngby on the Course panel; readers instantly recognise it as "a map"	No legend needed for the map itself; the reader's existing schema does the work
Transition Guidance	Animated transitions	Smooth re-colouring of the route when the reader selects a different runner	Preserves object permanence; same polyline, different colour story
Transition Guidance	Continuity editing	Same y-axis units (min/km), same colour palette across the four myth panels	The four panels feel like four chapters, not four standalone graphics

Category	Tool we used	Where on the site	Why we picked it
Ordering	Linear (default scroll path)	The whole page reads top-to-bottom: motivation → people → course → myths → verdict	Matches the structure of a research argument: set up the question, show the data, present each test, conclude
Ordering	User-directed path (in-section)	Runner-selector buttons on the Course and Myth 1 panels let the reader pick any subject	We can't pre-empt which runner each reader cares about most
Interactivity	Hover highlighting / details on demand	Tooltips on every chart; runner modal popup	Shneiderman's mantra (Week 6): overview first, then zoom, then details on demand
Interactivity	Filtering / selection / search	Click an archetype to filter; toggle runners on the pace chart	Reader can isolate the subgroup they care about without us pre-building 25 sub-charts
Interactivity	Very limited interactivity (on purpose)	The Hero panel and the Findings panel are deliberately static	Keeps the headline statement and the conclusions free of distraction; exactly where Segel & Heer say author-driven framing matters most
Messaging	Captions / Headlines	One short headline per panel ("Three kinds of runners", "The Course", "Predicted vs Actual")	A reader skimming should still get the spine of the story from the headlines alone
Messaging	Annotations + Finding boxes	Every myth panel has a `<div class="finding-box">` summarising the verdict in 2–3 sentences	Translates the chart into prose for readers who don't want to read the chart in detail
Messaging	Accompanying article	This notebook	The site is the story; the notebook is the working paper. Different audiences, same data

Testing Running Myths: Lyngby Half Marathon 2026¶

1. Motivation¶

2. The Dataset¶

2.1 What is our dataset?¶

2.2 Why this dataset?¶

2.3 Goal for the end-user experience¶

3. Data Cleaning & Preprocessing¶

3.1 Visual identity¶

3.2 Cleaning the survey¶

3.3 Parsing the GPX files¶

3.4 Merging survey + GPX¶

4. Basic Statistics¶

5. Exploratory Data Analysis¶

6. Data Analysis¶

6.1 The Course: geospatial visualisation (Week 5)¶

6.2 Prediction model: OLS (Week 4 linear regression)¶

6.3 The four myth tests¶

Myth 1 · Experienced runners pace themselves better¶

Myth 2 · Hill training helps on a hilly course¶

Myth 3 · More training volume always means faster times¶

Myth 4 · Interval training leads to smarter race strategy¶

6b. Copenhagen Half Marathon Comparison¶

7. Genre & Narrative Visualisation (Segel & Heer 2010, Week 6 / Week 8)¶

7.1 Genre¶

7.2 Visual Narrative tools¶

7.3 Narrative Structure tools¶

8. Visualisations: choices and justification¶

9. Discussion¶

What went well¶

What's still missing / what we'd improve¶

10. Contributions¶

11. References¶

Running literature¶

Data visualisation theory¶

Course materials¶

	name	gender	age	km_per_week	training_weeks	trained_hills	trains_intervals	target_min	finish_min_survey
0	Cristina	F	23	15	11	1	1	113.0	112.883333
1	Unai	M	23	15	14	0	0	105.0	104.733333
2	Oierga	M	23	37	11	0	1	105.0	95.716667
3	Oriol	M	23	25	8	0	1	83.0	81.616667
4	Álvaro	M	24	25	8	0	1	100.0	99.916667
5	Jose	M	22	15	11	0	0	110.0	108.950000
6	Maria	F	22	25	11	1	1	105.0	104.366667
7	Carlos Sainz	M	23	15	8	0	0	135.0	135.366667
8	Jon	M	23	52	18	0	1	105.0	104.366667
9	Lucia	F	22	7	2	0	1	120.0	119.050000

	name	gender	age	archetype	experience_level	km_per_week	training_weeks	trained_hills	trains_intervals	target_min	finish_min	first_half_pace	last_half_pace	split_ratio	has_gps
0	Célien	M	23	The Experienced	1	52	2	0	1	NaN	78.216667	3.736667	3.583333	0.958965	True
1	Pablo Bauri	M	26	The Experienced	1	25	8	1	1	80.0	79.950000	3.693333	3.692424	0.999754	True
2	Oriol	M	23	The Experienced	2	25	8	0	1	83.0	81.716667	3.895000	3.786364	0.972109	True
3	Marcus Henriksen	M	28	The Experienced	2	52	14	0	1	88.0	86.783333	4.100000	4.027273	0.982262	True
4	Oierga	M	23	The Grinders	0	37	11	0	1	105.0	95.416667	4.475000	4.445455	0.993398	True
5	Théophile	M	22	The Grinders	0	25	11	0	1	NaN	95.833333	4.491667	4.471212	0.995446	True
6	Eloi Colprim	M	23	The Grinders	1	15	11	0	1	97.0	98.566667	4.561667	4.680303	1.026007	True
7	Álvaro	M	24	The Experienced	1	25	8	0	1	100.0	100.016667	4.795000	4.563636	0.951749	True

Chart	Library	Why this chart?	Lecture reference
Histogram + KDE overlay (finish times)	`matplotlib` + `scipy.stats.gaussian_kde`	DAOST Ch. 2 argues KDE is more honest than histograms at small n because it removes the arbitrary bin-edge artefact; overlaying both lets the reader see the raw count and the smooth shape	Week 3 (DAOST Ch. 2)
Conditional KDE per archetype	`matplotlib` + `scipy.stats.gaussian_kde`	Direct visual analogue of the Week-3 P(crime\|district) idea; three filled KDEs on shared axes lets the reader compare centres and spreads at once	Week 3
Pairwise scatter (training vars vs finish)	`matplotlib`	Week 4's first step before any regression; see which variables look linear and where the outliers are	Week 4, DAOST Ch. 3
Folium map of the course	`folium` (Leaflet)	A geographic story needs a map; the same dark CartoDB tiles match the website palette so context switching is minimal	Week 5
Elevation profile coloured by pace	`matplotlib`	1-D abstraction of the map; elevation sits on a position axis (the highest-accuracy channel) and pace is encoded as colour hue (a medium-accuracy channel), following the Week 4 channel ranking	Week 4 + Week 5
Predicted vs Actual (interactive)	`plotly.graph_objects`	Web-ready interactive scatter; hover tooltips give details on demand	Week 6
Split-ratio dot-plot (Myth 1)	`matplotlib`	Boxplots hide individuals; a dot-plot shows every GPS runner individually alongside the archetype grouping, heeding the Week 3 warning about losing individual signal in summary stats	Week 3 (DAOST Ch. 2, fig 2-1)
Grouped bar + line (Myth 2)	`matplotlib`	Two parallel comparisons (HR change, pace change) live on the same panel; grouped bars are appropriate for two-group categorical comparison	Week 2 (chart-choice rules)
Linear vs semi-log scatter (Myth 3)	`matplotlib`	Week 4 exercise 2.2; test whether the relationship is exponential before committing to a linear fit	Week 4
Overlapping histograms (Myth 4)	`matplotlib`	DAOST Ch. 2: when comparing two continuous distributions, semi-transparent overlay reveals overlap better than side-by-side bars	Week 3 (DAOST Ch. 2)

Team member	Lead responsibilities
Marta Arana	Data collection (survey design in Spanish/English, runner coordination, race-day photos), GPX parsing pipeline, Myth 2 (hill training) analysis, overall visual identity & website design, project management
Esben Kok	Exploratory data analysis, OLS prediction model + Predicted vs Actual chart, Myth 3 (training volume), Segel & Heer mapping (Section 7), notebook structure
Sergi Lupon	Folium course map, elevation profile rendering, Myth 1 (pacing) and Myth 4 (intervals) analyses, interactive Plotly charts and HTML embeds, deployment to GitHub Pages