Everything

Quite a while ago my amazing wife asked me if it was possible to find the time of the first goal for the 2006 FIFA World Cup matches.  I was using R at the time and thought it was possible.  Here are the scripts I wrote to scrape the info from the FIFA website.  They're also posted on my github here.

There are three scripts, One scrapes the data and saves it as CSV files, the next does some processing and saves the results as csv's and the third produces some basic graphs.

Web Scraping Script

# Script to Scrape web pages to collect World 
# Script to Scrape web pages to collect World Cup 2006 score data
#
# Author: Paul Hurley
###############################################################################
require(ggplot2)
require(plyr)
require(stringr)
require(RCurl)
require(XML)
#' Function to Sort a dataframe with a given list of columns
#' Cribbed from Spector, P. (2008). "Data Manipulation with R", UseR! Springer. Pg78
#' @param df Dataframe to be sorted
#' @param ... list of columns to sort on
#' @returnType
#' @return A sorted dataframe
#' @author "Paul Hurley"
#' @
export
#' @
#' @usage with(dataframe,sortframe(dataframe,column1, column2, column3))
#' @examples with(iris,sortframe(iris,Sepal.Length,Sepal.Width,Petal.Length))
sortframe<-function(df,...){df[do.call(order,list(...)),]}
goals<-function(match) {
theURL <-paste("http://www.fifa.com/worldcup/archive/germany2006/results/matches/match=974100",match,"/report.html",sep="")
webpage = tryCatch(getURL(theURL, header=FALSE, verbose=TRUE),
HTTPError = function(e) {
cat("HTTP error: ", e$message, "\n")
})
message(paste("Webpage size is ",nchar(webpage),sep=""))
webpagecont <- readlines="" tc="" -="" textconnection="" webpage="" close="" fifa="" doc="" -htmlparse="" webpagecont="" xpathsapply="" div="" class="cont" xmlvalue="" goals="" scored="" grep="" value="TRUE)" -gsub="" strsplit="" 1="" -strsplit="" table="" -as="" data="" frame="" matrix="" unlist="" ncol="3,byrow=TRUE))" names="" -c="" player="NA,Team=NA,Time=NA)" team="" time="" message="" paste="" there="" were="" nrow="" sep="" if="" 0="" -data="" now="" get="" the="" match="" details="" contains="" teams="" and="" final="" score="" 2="" number="" date="" venue="" attendance="" 0-9="" a-za-z="" fullmatch="" -fifa="" tempfifa="" matchdatetimevenue="" stadiumattendance="" -substr="" nchar="" -2="" tempfifa2="" -unlist="" -paste="" substr="" 2006="" 1000="" :="" 0-5="" return="" groupa="" 01="" 02="" 17="" 18="" 33="" 34="" groupb="" 03="" 04="" 19="" 20="" 35="" 36="" groupc="" 05="" 06="" 21="" 22="" 37="" 38="" groupd="" 07="" 08="" 23="" 25="" 39="" 40="" groupe="" 09="" 10="" 26="" 41="" 42="" groupf="" 11="" 12="" 27="" 28="" 43="" 44="" groupg="" 13="" 14="" 29="" 30="" 45="" 46="" grouph="" 15="" 16="" 31="" 32="" 47="" 48="" round16="" 49="" 50="" 51="" 52="" 53="" 54="" 55="" quater="" 57="" 58="" 59="" 60="" semi="" 61="" 62="" 64="" wooden="" 63="" groupar="" -ldply="" groupbr="" groupcr="" groupdr="" grouper="" groupfr="" groupgr="" grouphr="" round16r="" quaterr="" semir="" finalr="" woodenr="" datadir="" home="" paul="" workspace="" world_cup="" write="" csv="" world="" cup="" -rbind="" worldcup2006="" pre="">

The processing script

# TODO: Add comment
#
# Author: paul
###############################################################################
require(ggplot2)
require(plyr)
require(stringr)
require(RCurl)
require(XML)
sortframe<-function(df,...){df[do.call(order,list(...)),]}
datadir<-"/home/paul/workspace/world_cup/data/"
world.cup.2006<-read.csv(paste(datadir, "worldcup2006.csv", sep=""))
world.cup.2006$Timen<-as.numeric(str_extract(as.character(world.cup.2006$Time)," [0-9]*"))
teamgoals<-ddply(subset(world.cup.2006,!is.na(Team)),.(Team),nrow)
top5<-subset(world.cup.2006,Team %in% (with(teamgoals,sortframe(teamgoals,-V1))$Team[1:5]))
top5$Team<-factor(top5$Team)
write.csv(top5, paste(datadir, "top5.csv", sep=""))
firstgoal<-ddply(world.cup.2006,.(match),function(df) {
with(df,sortframe(df,Timen))
return(df[1,])
})
write.csv(firstgoal,paste(datadir, "firstgoals.csv", sep=""))
top5firstgoal<-ddply(top5,.(match),function(df) {
with(df,sortframe(df,Timen))
return(df[1,])
})
write.csv(top5firstgoal,paste(datadir, "top5firstgoal", sep=""))

and the graphs

# TODO: Add comment
#
# Author: paul
###############################################################################
require(ggplot2)
require(plyr)
require(stringr)
datadir<-"/home/paul/workspace/world_cup/data/"
firstgoal<-read.csv(file=paste(datadir, "
firstgoals.csv", sep=""))
top5firstgoal<-read.csv(file=paste(datadir, "top5firstgoal.csv", sep=""))
top5<-read.csv(file=paste(datadir, "top5.csv", sep=""))
print(qplot(Timen,data=firstgoal, geom="histogram", binwidth=1))
qplot(Timen,data=firstgoal, geom="histogram", binwidth=5)
qplot(Timen,data=firstgoal, geom="histogram", binwidth=10)
qplot(factor(Team), Timen, data=firstgoal, geom="boxplot")+geom_jitter()
qplot(factor(Team), Timen, data=world.cup.2006, geom="boxplot")+geom_jitter()
qplot(Timen,data=top5firstgoal, geom="histogram", binwidth=1)
qplot(Timen,data=top5firstgoal, geom="histogram", binwidth=5)
ggplot(top5firstgoal,aes(Timen, fill=Team))+geom_density(alpha=0.2)
qplot(factor(Team), Timen, data=top5firstgoal, geom="boxplot")+geom_jitter()
qplot(factor(Team), Timen, data=top5, geom="boxplot")+geom_jitter()

time2score byteam box

 

time2score hist

 

time2score by team

 

 

My amazing, awesome wife often comes up with the little puzzles for our amazing children, and this one seemed destined to be solved in R.  So, using up to 5 coins (1p, 2p, 5p, 10p, 20p and 50p) first she asked our kids whether they could make every value up to 50p, and then what the smallest value they couldn't make was.

Here's my R solution (which took about 5mins less than our daughter took
to answer the first question)

# What Amounts can't you make using up to 5 coins 1p to 50p
# 
# Author: Paul Hurley
library(ggplot2)
library(plyr)
# Define our coins
coins <- as="" factor="" c="" 0="" 1="" 2="" 5="" 10="" 20="" 50="" build="" a="" list="" of="" all="" the="" possibilities="" -="" expand="" grid="" coin1="" coin2="" coin3="" coin4="" coin5="" calculate="" result="" total="" numeric="" character="" define="" our="" target="" values="" targets="" 1:250="" what="" amounts="" aren="" t="" possible="" in="" 88="" 89="" 98="" 99="" 118="" 119="" 128="" 129=
"" 133="" 134="" 136="" 137="" 138="" 139="" 143="" 144="" 146="" 18="" 147="" 148="" 149="" 158="" 159="" 163="" 164="" 166="" 167="" 168="" 169="" 173="" 174="" 176="" 177="" 178="" 179="" 35="" 181="" 182="" 183="" 184="" 185="" 186="" 187="" 188="" 189="" 191="" 192="" 193="" 194="" 195="" 196="" 197="" 198="" 52="" 199="" 203="" 204="" 206="" 207="" 208="" 209="" 211="" 212="" 213="" 214="" 215="" 216="" 217="" 218="" 219="" 221="" 69="" 222="" 223="" 224="" 225="" 226="" 227="" 228="" 229="" 230="" 231="" 232="" 233="" 234="" 235="" 236="" 237="" 238="" 86="" 239="" 240="" 241="" 242="" 243="" 244="" 245="" 246="" 247="" 248="" 249="" pre="">


So, the smallest value we can't make is 88

We can even produce a table of the number of ways to make each number, and a graph

tableofpossibilities<-ddply(.data = possibilities, .(total), nrow)
ggplot(data = possibilities, aes(x = total)) + geom_histogram(binwidth = 1)

Giving this graph

unnamed-chunk-2


Then when I triumphantly told her, she asked, 'what about 4 coins ?'

# How about 4 coins build a list of all the possibilities
fourpossibilities <- expand="" grid="" coin1="" coin2="" coin3="" coin4="" calculate="" the="" result="" fourpossibilities="" total="" -="" as="" numeric="" character="" what="" values="" can="" t="" be="" made="" targets="" in="" 1="" 38="" 39="" 48="" 49="" 68="" 69="" 78="" 79="" 83="" 84="" 86="" 87="" 88="" 89="" 93="" 94="" 96="" 18="" 97="" 98="" 99="" 108="" 109="" 113="" 114="" 116="" 117="" 118="" 119="" 123="" 124="" 126="" 127="" 128="" 129="" 35="" 131="" 132="" 133="" 134="" 135="" 136="" 137="" 138="" 139="" 141="" 142="" 143="" 144="" 145="" 146="" 147="" 148="" 52="" 149="" 153="" 154="" 156="" 157="" 158="" 159="" 161="" 162="" 163="" 164="" 165="" 166="" 167="" 168="" 169="" 171="" 172="" 173="" 174="" 175="" 176="" 177="" 178="" 179="" 180="" 181="" 182="" 183="" 184="" 185="" 186="" 187="" 188="" 189="" 190="" 191="" 192="" 193="" 194="" 195="" 196="" 197="" 198="" 199="" 201="" 202="" 203="" 204="" 205="" 206="" 103="" 207="" 208="" 209="" 210="" 211="" 212="" 213="" 214="" 215="" 216="" 217="" 218="" 219="" 220="" 221="" 222="" 223="" 120="" 224="" 225="" 226="" 227="" 228="" 229="" 230="" 231="" 232="" 233="" 234="" 235="" 236="" 237="" 238="" 239="" 240="" 241="" 242="" 243="" 244="" 245="" 246="" 247="" 248="" 249="" 250="" pre="">
So, the smallest value of four coins is 38p

So, the smallest value of four coins is 38p

 

 

A while ago I had the need to produce some posters that included lots of data (scientific style).  Having recently got back into R and learning LaTex I googled for a way to do this using R.  Here's what I found and ended up with, using R, LaTex, Beamer and BeamerPoster.

You can pull my beamerposter template and an example Sweave Rnw file from my Github.

beamerposter image