• 1 Introduction
    • 1.1 Dirty dataset
    • 1.2 Source
  • 2 Setup
    • 2.1 Loading libraries
    • 2.2 Loading cleaned data
  • 3 Data
    • 3.1 Raw
    • 3.2 Clean data
  • 4 Questions
    • 4.1 Long Jump
    • 4.2 100m sprint
    • 4.3 Best all-rounder
    • 4.4 Shot-put
    • 4.5 400m sprint

1 Introduction

1.1 Dirty dataset

The decathlon data set comes from the FactoMineR package and records athletes' results in two competitions: the Decastar and the Olympic Games.

1.2 Source

Department of Statistics and Computer Science, Agrocampus Rennes

2 Setup

2.1 Loading libraries

library(readr)
library(here)
library(janitor)
library(tidyverse)
library(data.table)

2.2 Loading cleaned data

# Read the cleaned decathlon data from the project-relative RDS file.
decathlon <- read_rds(here("clean_data/clean_data.rds"))

3 Data

3.1 Raw

# Load the raw decathlon RDS file (path resolved with here()) and wrap it in
# a data.table for display.
data.table(read_rds(here("raw_data/decathlon.rds")))
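| 100m | Long.jump | Shot.put | High.jump | 400m | 110m.hurdle | Discus | Pole.vault |
|---|---|---|---|---|---|---|---|
| <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
| 11.04 | 7.58 | 14.83 | 2.07 | 49.81 | 14.69 | 43.75 | 5.02 |
| 10.76 | 7.40 | 14.26 | 1.86 | 49.37 | 14.05 | 50.72 | 4.92 |
| 11.02 | 7.30 | 14.77 | 2.04 | 48.37 | 14.09 | 48.95 | 4.92 |
| 11.02 | 7.23 | 14.25 | 1.92 | 48.93 | 14.99 | 40.87 | 5.32 |
| 11.34 | 7.09 | 15.19 | 2.10 | 50.42 | 15.31 | 46.26 | 4.72 |
| 11.11 | 7.60 | 14.31 | 1.98 | 48.68 | 14.23 | 41.10 | 4.92 |
| 11.13 | 7.30 | 13.48 | 2.01 | 48.62 | 14.17 | 45.67 | 4.42 |
| 10.83 | 7.31 | 13.76 | 2.13 | 49.91 | 14.38 | 44.41 | 4.42 |
| 11.64 | 6.81 | 14.57 | 1.95 | 50.14 | 14.93 | 47.60 | 4.92 |
| 11.37 | 7.56 | 14.41 | 1.86 | 51.10 | 15.06 | 44.99 | 4.82 |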

3.2 Clean data

# Show the cleaned data as a data.table.
data.table(decathlon)
| competitor | ranking | overall_competition_points | competition | event |
|---|---|---|---|---|
| <chr> | <int> | <int> | <fctr> | <chr> |
| SEBRLE | 1 | 8217 | Decastar | 100m_sprint |
| SEBRLE | 1 | 8217 | Decastar | long_jump |
| SEBRLE | 1 | 8217 | Decastar | shot_put |
| SEBRLE | 1 | 8217 | Decastar | high_jump |
| SEBRLE | 1 | 8217 | Decastar | 400m_sprint |
| SEBRLE | 1 | 8217 | Decastar | 110m_hurdles |
| SEBRLE | 1 | 8217 | Decastar | discus |
| SEBRLE | 1 | 8217 | Decastar | pole_vault |
| SEBRLE | 1 | 8217 | Decastar | javlin |
| SEBRLE | 1 | 8217 | Decastar | 1500m_race |

4 Questions

4.1 Long Jump

Finding the longest long jump in the data

# Keep only the long-jump rows, then retain the row(s) whose result equals the
# maximum, so ties are all returned. The ranking and overall-points columns
# are dropped from the displayed result.
decathlon %>%
  filter(event == "long_jump") %>%
  group_by(event) %>%
  filter(event_points == max(event_points)) %>%
  select(-(ranking:overall_competition_points))
| competitor | competition | event | event_points |
|---|---|---|---|
| <chr> | <fctr> | <chr> | <dbl> |
| Clay | OlympicG | long_jump | 7.96 |

4.2 100m sprint

Finding the average 100m time for each competition

# Mean 100m result per competition, rounded to two decimal places.
decathlon %>%
  group_by(competition) %>%
  filter(event == "100m_sprint") %>%
  summarise(average_100m_time = round(mean(event_points), digits = 2))
| competition | average_100m_time |
|---|---|
| <fctr> | <dbl> |
| Decastar | 11.18 |
| OlympicG | 10.92 |

4.3 Best all-rounder

Finding the competitor with the highest total points across both competitions

# Competitor(s) with the highest combined overall score across competitions.
#
# The data holds one row per competitor per event, so a competition's overall
# score is repeated on every event row, and the same athlete is spelled in
# different cases in the two competitions (e.g. "SEBRLE" / "Sebrle").
# Normalise the name and collapse to one row per competitor and competition
# before summing; otherwise each score is counted once per event and the two
# competitions are never added together.
decathlon %>%
  mutate(competitor = str_to_title(competitor)) %>%
  distinct(competitor, competition, overall_competition_points) %>%
  group_by(competitor) %>%
  summarise(total_competition_points = sum(overall_competition_points)) %>%
  # Keeps every competitor tied for the maximum; no further truncation needed.
  filter(total_competition_points == max(total_competition_points))
## `summarise()` ungrouping output (override with `.groups` argument)
ABCDEFGHIJ0123456789
competitor
<chr>
total_competition_points
<int>
Sebrle88930

4.4 Shot-put

Finding the shot-put scores for the top three competitors in each competition

# Three largest shot-put results within each competition (rows tied at the
# cut-off are kept), listed from the largest result to the smallest.
# NOTE(review): this ranks by the shot-put result itself, not by the athletes'
# overall `ranking`; confirm that is the intended reading of the question.
decathlon %>%
  filter(event == "shot_put") %>%
  select(-overall_competition_points, -ranking) %>%
  group_by(competition) %>%
  # slice_max() replaces the superseded top_n(); ties are kept by default.
  slice_max(event_points, n = 3) %>%
  ungroup() %>%
  arrange(desc(event_points))
| competitor | competition | event | event_points |
|---|---|---|---|
| <chr> | <fctr> | <chr> | <dbl> |
| Sebrle | OlympicG | shot_put | 16.36 |
| Karpov | OlympicG | shot_put | 15.93 |
| Macey | OlympicG | shot_put | 15.73 |
| YURKOV | Decastar | shot_put | 15.19 |
| SEBRLE | Decastar | shot_put | 14.83 |
| KARPOV | Decastar | shot_put | 14.77 |

4.5 400m sprint

Calculating the average overall competition points for competitors who ran the 400m in under 50 seconds versus those who took 50 seconds or longer

# Split the 400m rows on whether the result is below 50, average each group's
# overall competition points (rounded to a whole number), and list the higher
# average first.
decathlon %>%
  filter(event == "400m_sprint") %>%
  group_by(event_points < 50) %>%
  summarise(
    average_points = round(mean(overall_competition_points), digits = 0)
  ) %>%
  arrange(-average_points)
## `summarise()` ungrouping output (override with `.groups` argument)
| event_points < 50 | average_points |
|---|---|
| <lgl> | <dbl> |
| TRUE | 8120 |
| FALSE | 7727 |