Visualizing ROC Curves in R using Plotly

(This article was first published on R – Modern Data, and kindly contributed to R-bloggers)

In this post we’ll create some simple functions to generate and chart a Receiver Operator (ROC) curve and visualize it using Plotly. See Carson’s plotly book for more details around changes in syntax.

We’ll do this from a credit risk perspective i.e. validating a bank’s internal rating model (we’ll create a sample dataset keeping this in mind)

We’ll replicate computations highlighted in this paper.

library(plotly)
library(dplyr)
library(flux)

Sample data

set.seed(123)
n <- 100000
lowest.rating <- 10

# Sample internal ratings
# Say we have a rating scale of 1 to 10̥
ratings <- sample(1:lowest.rating, size = n, replace = T)

# Defaults
# We'll randomly assign defaults concentrating more defaults 
# in the lower rating ranges. We'll do this by creating exponentially
# increasing PDs across the rating range

power <- 5
PD <- log(1:lowest.rating)
PD <- PD ^ power

#PD <- exp((1:lowest.rating))
PD <- PD/(max(PD) * 1.2)  # increased denominator to make the PDs more realistic
Now given PD for eac rating category sample from a binomial distribution
# to assign actual defaults
defaults <- rep(0, n)
k <- 1
for(i in ratings){
  defaults[k] <- rbinom(1, 1, PD[i])
  
  k <- k + 1
}

dataset <- data.frame(Rating = ratings,
                      Default = defaults)

# Check if dataset looks realistic̥
# df <- dataset %>% 
#   group_by(Rating) %>% 
#   summarize(Def = sum(Default == 1), nDef = sum(Default == 0))

ROC Curve Computation

Now that we have a sample dataset to work with we can start to create the ROC curve

ROCFunc <- function(cutoff, df){
  
  # Function counts the number of defaults hap̥pening in all the rating
  # buckets less than or equal to the cutoff
  
  # Number of hits = number of defaults with rating < cutoff / total defaults
  # Number of false alarms = number ofnon defaults with rating < cutoff / total non defaults

  nDefault <- sum(df$Default == 1)
  notDefault <- sum(df$Default == 0)

  temp <- df %>% filter(Rating >= cutoff)
  hits <- sum(temp$Default == 1)/nDefault
  falsealarm <- sum(temp$Default == 0)/notDefault
  ret <- matrix(c(hits, falsealarm), nrow = 1)
  colnames(ret) <- c("Hits", "Falsealarm")

  return(ret)
}

# Arrange ratings in decreasing order
# A lower rating is better than a higher rating
vec <- sort(unique(ratings), decreasing = T)
ROC.df <- data.frame()

for(i in vec){
  ROC.df <- rbind(ROC.df, ROCFunc(i, dataset))
}

# Last row to complete polygon

labels <- data.frame(x = ROC.df$Falsealarm, 
                     y = ROC.df$Hits,
                     text = vec)

ROC.df <- rbind(c(0,0), ROC.df)

# Area under curve
AUC <- round(auc(ROC.df$Falsealarm, ROC.df$Hits),3)

Plot

plot_ly(ROC.df, y = ~Hits, x = ~Falsealarm, hoverinfo = "none") %>% 
  
  add_lines(name = "Model",
            line = list(shape = "spline", color = "#737373", width = 7), 
            fill = "tozeroy", fillcolor = "#2A3356") %>% 
  
  add_annotations(y = labels$y, x = labels$x, text = labels$text,
                  ax = 20, ay = 20,
                  arrowcolor = "white",
                  arrowhead = 3,
                  font = list(color = "white")) %>% 
  
  add_segments(x = 0, y = 0, xend = 1, yend = 1, 
               line = list(dash = "7px", color = "#F35B25", width = 4), 
               name = "Random") %>% 
  
  add_segments(x = 0, y = 0, xend = 0, yend = 1, 
               line = list(dash = "10px", color = "black", width = 4), 
               showlegend = F) %>%
  
  add_segments(x = 0, y = 1, xend = 1, yend = 1, 
               line = list(dash = "10px", color = "black", width = 4), 
               showlegend = F) %>% 
  
  add_annotations(x = 0.8, y = 0.2, showarrow = F, 
                  text = paste0("Area Under Curve: ", AUC),
                  font = list(family = "serif", size = 18, color = "#E8E2E2")) %>%
  
  add_annotations(x = 0, y = 1, showarrow = F, xanchor = "left", 
                  xref = "paper", yref = "paper",
                  text = paste0("Receiver Operator Curve"),
                  font = list(family = "arial", size = 30, color = "#595959")) %>%
  
  add_annotations(x = 0, y = 0.95, showarrow = F, xanchor = "left", 
                  xref = "paper", yref = "paper",
                  text = paste0("Charts the percentage of correctly identified defaults (hits) against the percentage of non defaults incorrectly identifed as defaults (false alarms)"),
                  font = list(family = "serif", size = 14, color = "#999999")) %>% 
  
   
  layout(xaxis = list(range = c(0,1), zeroline = F, showgrid = F,
                      title = "Number of False Alarms"),
         yaxis = list(range = c(0,1), zeroline = F, showgrid = F,
                      domain = c(0, 0.9),
                      title = "Number of Hits"),
         plot_bgcolor = "#E8E2E2",
         height = 800, width = 1024)