I'm trying to calculate bid-ask spreads based on daily high & low prices. The paper & algorithm for this was proposed by CORWIN & SCHULTZ (2012) (http://onlinelibrary.wiley.com/doi/10.1111/j.1540-6261.2012.01729.x/abstract).
This is their implementation in SAS: https://www3.nd.edu/~scorwin/HILOW_Estimator_Sample_002.sas
This is my translation into R:
# ----------------------------------- #
##### 2. CALCULATE BID-ASK-SPREAD #####
# ----------------------------------- #
##### 2.1 AGGREGATE TRADES BY DAY
# To calculate the bid-ask-spread, we need the following inputs:
# DATE = date in yyyymmdd format
# PRC = daily closing price
# LOPRC = daily low price
# HIPRC = daily high price
# import transactions data
transactions <- read.csv(file = "transaction_data_filtered.csv", sep = ",", header = T)
##### 2.1.1 Reformatting & Cleaning
transactions$DOLLAR_PRICE <- as.double(transactions$DOLLAR_PRICE)
transactions$PAR_TRADED <- as.numeric(transactions$PAR_TRADED)
# create a comparable representation of time to get closing price
transactions$SECONDS_SINCE_MIDNIGHT <- 0
for (i in 1:3) {
transactions$SECONDS_SINCE_MIDNIGHT <- transactions$SECONDS_SINCE_MIDNIGHT + as.integer(sapply(strsplit(as.character(transactions$TIME_OF_TRADE), ":"), "[", i)) * 60 ^ (3 - i)
}
# some NAs got introduced; values are corrupt --> drop
nas <- subset(transactions, is.na(transactions$SECONDS_SINCE_MIDNIGHT))
transactions <- subset(transactions, !is.na(transactions$SECONDS_SINCE_MIDNIGHT))
rm(nas, i)
# format date as proper date
transactions$DATE <- anytime(transactions$TRADE_DATE)
# add iq_issuer_ciqid to clearly identify trades from one entity
transactions$IQ_ISSUER_CIQID <- NULL
iq_issuer_ciqid <- read.csv(file = "output_data/cusips_and_ciqid.csv", header = T, sep = "," )
keeps <- c("cusip", "iq_issuer_ciqid")
iq_issuer_ciqid <- iq_issuer_ciqid[keeps]
colnames(iq_issuer_ciqid) <- c("CUSIP", "IQ_ISSUER_CIQID")
rm(keeps)
transactions <- merge(transactions, iq_issuer_ciqid, by = "CUSIP", all.x = T)
##### 2.1.2 Generate high, low and closing prices
# on a daily base; use data.table for higher efficiency
transactions_dt <- as.data.table(transactions)
transactions_daily_by_issuer <- transactions_dt[, .(HIPRC = max(DOLLAR_PRICE), LOPRC = min(DOLLAR_PRICE), PRC = max(ifelse(SECONDS_SINCE_MIDNIGHT == max(SECONDS_SINCE_MIDNIGHT), DOLLAR_PRICE, 0)), VOLUME = sum(PAR_TRADED)), by = .(IQ_ISSUER_CIQID, DATE)]
# investigate the number of trades to get a better feeling for the data
number_of_transactions <- as.data.frame(xtabs(~IQ_ISSUER_CIQID, transactions))
colnames(number_of_transactions) <- c("IQ_ISSUER_CIQID", "NUMBER_OF_TRADES")
summary(as.integer(number_of_transactions$NUMBER_OF_TRADES))
number_of_transactions <- number_of_transactions[order(number_of_transactions$NUMBER_OF_TRADES, decreasing = T),]
# replace all # of trades over 10.000 with 10.000 to get a meaningful histogram
number_of_transactions$NUMBER_OF_TRADES[number_of_transactions$NUMBER_OF_TRADES > 10000] <- 10000
hist(number_of_transactions$NUMBER_OF_TRADES, breaks = 20, col = "blue")
plot_density <- density(number_of_transactions$NUMBER_OF_TRADES)
plot(plot_density,, main = "Density of number of trades per IQ_ISSUER_CIQID")
polygon(plot_density, col = "blue")
# general problem: a lot of entities don't have much volume --> KEEP IN MIND!
write.csv(transactions_daily_by_issuer, "transaction_data.csv")
rm(transactions_daily_by_issuer, transactions, transactions_dt, number_of_transactions)
##### 2.2 Calc bid-ask-spread as liquidity proxy - based on CORWIN(2012) #####
# ------------------------------------------------------------------------- #
##### 2.2.1 Adjust/remove bad prices
##### 2.2.1.1 IF (LOPRC = HIPRC OR LOPRC <= 0 OR HIPRC <= 0 THEN DO: DROP BAD PRICES ONLY (USE BID / ASK ON ZERO VOLUME DAYS)
# import file to R and remove unnecessary column "X"
transaction_data <- read.csv("transaction_data.csv")
transaction_data$X <- NULL
# basically all bad prices are due to HIPRC == LOPRC --> indicates only one trade at that day, KEEP IN MIND!
bad_prices <- subset(transaction_data, (LOPRC == HIPRC | LOPRC <= 0 | HIPRC <= 0 | PRC <= 0 | VOLUME == 0))
# sorting the dataframe based on CompanyID and Date
sample <- transaction_data[order(transaction_data$IQ_ISSUER_CIQID, transaction_data$DATE),]
sample$LOPRCIN <- sample$LOPRC
sample$HIPRCIN <- sample$HIPRC
##### 2.2.1.2 RETAIN GOOD HIGH-LOW PRICES AND REPLACE IN CASES WHERE HIGH=LOW
sample$ISAMEPRC <- ifelse(sample$LOPRC == sample$HIPRC, 1, 0)
sample$INOTRADE <- ifelse(sample$PRC < 0 | sample$VOLUME == 0, 1, 0) # 0 results with "1"
sample$LOPRC[sample$PRC <= 0 | sample$VOLUME == 0] <- NA
sample$HIPRC[sample$PRC <= 0 | sample$VOLUME == 0] <- NA
sample_na <- subset(sample, is.na(sample$PRC)) # no NAs introduced
rm(sample_na)
sample$PRC <- abs(sample$PRC) # no effect either
sample$LOPRCR <- NA
sample$HIPRCR <- NA
sample[1, "LOPRCR"] <- sample[1, "LOPRC"]
sample[1, "HIPRCR"] <- sample[1, "HIPRC"]
for (i in 2:length(sample$IQ_ISSUER_CIQID)) {
if (sample[i, "LOPRC"] > 0 && sample[i, "LOPRC"] < sample[i, "HIPRC"]) {
sample$LOPRCR <- sample$LOPRC
sample$HIPRCR <- sample$HIPRC
}
else {
sample[i, "LOPRCR"] <- sample[i - 1, "LOPRCR"]
sample[i, "HIPRCR"] <- sample[i - 1, "HIPRCR"]
if (sample[i, "PRC"] >= sample[i, "LOPRC"] && sample[i, "PRC"] <= sample[i, "HIPRC"]) {
#REPLACE IF WITHIN PRIOR DAY'S RANGE
sample[i, "LOPRC"] <- sample[i - 1, "LOPRCR"]
sample[i, "HIPRC"] <- sample[i - 1, "HIPRCR"]
sample$HLRESET <- 1
}
if (sample[i, "PRC"] < sample[i - 1, "LOPRCR"]) {
#REPLACE IF BELOW PRIOR DAY'S RANGE
sample[i, "LOPRC"] <- sample[i, "PRC"]
sample[i, "HIPRC"] <- sample[i, "HIPRCR"] - (sample[i, "LOPRCR"] - sample[i, "PRC"])
sample$HLRESET <- 2
}
if (sample[i, "PRC"] > sample[i - 1, "HIPRCR"]) {
#REPLACE IF ABOVE PRIOR DAY'S RANGE
sample[i, "LOPRC"] <- sample[i - 1, "LOPRCR"] + (sample[i, "PRC"] - sample[i - 1, "HIPRCR"])
sample[i, "HIPRC"] <- sample[i, "PRC"]
sample$HLRESET <- 3
}
}
}
rm(i)
##### 2.2.1.3 ADJUST FOR OVERNIGHT RETURNS BASED ON LAGGED CLOSING PRICE
# THIPRC = today high price
# TLOPRC = today day low price
# LHIPRC = lag high price (yesterday)
# LLOPRC = lag low price (yesterday)
# LPRC = lag closing price (yesterday)
sample$RETADJ <- 0
#CURRENT DAY LOW PRICE
sample$TLOPRC <- sample$LOPRC
#CURRENT DAY HIGH PRICE
sample$THIPRC <- sample$HIPRC
#PRIOR DAY LOW PRICE
sample <- sample %>%
group_by(IQ_ISSUER_CIQID) %>%
mutate(LLOPRC = lag(LOPRC))
#PRIOR DAY HIGH PRICE
sample <- sample %>%
group_by(IQ_ISSUER_CIQID) %>%
mutate(LHIPRC = lag(HIPRC))
#PRIOR DAY PRICE
sample <- sample %>%
group_by(IQ_ISSUER_CIQID) %>%
mutate(LPRC = lag(PRC))
f.adjust.overnight.pricechanges <- function(sample) {
#ADJUST WHEN PRIOR CLOSE IS BELOW CURRENT LOW
sample$THIPRC[sample$LPRC < sample$LOPRC &
sample$LPRC > 0 &
!is.na(sample$LPRC) & !is.na(sample$LOPRC)] <-
sample$HIPRC[sample$LPRC < sample$LOPRC &
sample$LPRC > 0 &
!is.na(sample$LPRC) & !is.na(sample$LOPRC)] -
(sample$LOPRC[sample$LPRC < sample$LOPRC &
sample$LPRC > 0 &
!is.na(sample$LPRC) &
!is.na(sample$LOPRC)] - sample$LPRC[sample$LPRC < sample$LOPRC &
sample$LPRC > 0 &
!is.na(sample$LPRC) & !is.na(sample$LOPRC)])
sample$TLOPRC[sample$LPRC < sample$LOPRC &
sample$LPRC > 0 &
!is.na(sample$LPRC) & !is.na(sample$LOPRC)] <-
sample$LPRC[sample$LPRC < sample$LOPRC &
sample$LPRC > 0 &
!is.na(sample$LPRC) & !is.na(sample$LOPRC)]
sample$RETADJ[sample$LPRC < sample$LOPRC &
sample$LPRC > 0 &
!is.na(sample$LPRC) & !is.na(sample$LOPRC)] <- 1
#ADJUST WHEN PRIOR CLOSE IS ABOVE CURRENT HIGH
sample$THIPRC[sample$LPRC > sample$HIPRC &
sample$LPRC > 0 &
!is.na(sample$LPRC) & !is.na(sample$HIPRC)] <-
sample$LPRC[sample$LPRC > sample$HIPRC &
sample$LPRC > 0 &
!is.na(sample$LPRC) & !is.na(sample$HIPRC)]
sample$TLOPRC[sample$LPRC > sample$HIPRC &
sample$LPRC > 0 &
!is.na(sample$LPRC) & !is.na(sample$HIPRC)] <-
sample$LOPRC[sample$LPRC > sample$HIPRC &
sample$LPRC > 0 &
!is.na(sample$LPRC) & !is.na(sample$HIPRC)] +
(sample$LPRC[sample$LPRC > sample$HIPRC &
sample$LPRC > 0 &
!is.na(sample$LPRC) &
!is.na(sample$HIPRC)] - sample$HIPRC[sample$LPRC > sample$HIPRC &
sample$LPRC > 0 &
!is.na(sample$LPRC) & !is.na(sample$HIPRC)])
sample$RETADJ[sample$LPRC > sample$HIPRC &
sample$LPRC > 0 &
!is.na(sample$LPRC) & !is.na(sample$HIPRC)] <- 2
return(sample)
}
sample <- f.adjust.overnight.pricechanges(sample)
##### 2.3 Calculate daily high-low spread estimates #####
# ----------------------------------------------------- #
# declare some variables & constants
v_PI <- pi
v_K <- 1 / (4 * log(2))
v_K1 <- 4 * log(2) # what is this for?? unused
v_K2 <- sqrt(8 / v_PI)
v_CONST <- 3 - (2 * sqrt(2))
sample <- transform(sample, LOPRC2 = pmin(sample$TLOPRC, sample$LLOPRC))
sample <- transform(sample, HIPRC2 = pmin(sample$THIPRC, sample$LHIPRC))
##### 2.3.1 CALCULATE DAILY HIGH-LOW SPREAD ESTIMATES
f.calculate.spread <- function(sample) {
sample$BETA <- 0
# Beta
sample$BETA[!is.na(sample$TLOPRC) > 0 & !is.na(sample$LLOPRC) > 0] <-
(log(sample$THIPRC[!is.na(sample$TLOPRC) > 0 & !is.na(sample$LLOPRC) > 0] / sample$TLOPRC[!is.na(sample$TLOPRC) > 0 & !is.na(sample$LLOPRC) > 0])) ^ 2 +
(log(sample$LHIPRC[!is.na(sample$TLOPRC) > 0 & !is.na(sample$LLOPRC) > 0] / sample$LLOPRC[!is.na(sample$TLOPRC) > 0 & !is.na(sample$LLOPRC) > 0])) ^ 2
# Gamma
sample$GAMMA[!is.na(sample$LOPRC2) > 0] <-
log(sample$HIPRC2[!is.na(sample$LOPRC2) > 0] / sample$LOPRC2[!is.na(sample$LOPRC2) > 0]) ^ 2
# Alpha
sample$ALPHA <- (sqrt(2 * sample$BETA) - sqrt(sample$BETA)) / v_CONST - sqrt(sample$GAMMA / v_CONST)
# Spread
sample$SPREAD <- 2 * (exp(sample$ALPHA) - 1) / (1 + exp(sample$ALPHA))
# set negative spread estimates to zero
sample <- transform(sample, SPREAD_0 = pmax(sample$SPREAD, 0))
# drop negative spread estimates
sample$SPREAD_MISS[!is.na(sample$SPREAD) > 0] <- sample$SPREAD[!is.na(sample$SPREAD) > 0]
# Sigma
sample$SIGMA <- ((sqrt(sample$BETA / 2) - sqrt(sample$BETA))) / (v_K2 * v_CONST) + sqrt(sample$GAMMA / (v_K2 * v_K2 * v_CONST))
# set negative sigma estimates to zero
sample <- transform(sample, SIGMA_0 = pmax(sample$SIGMA, 0))
# drop negative sigma estimates
return(sample)
}
sample <- f.calculate.spread(sample)
rm(v_CONST, v_K, v_K1, v_K2, v_PI)
##### 2.4 Output daily high-low spread estimates #####
# -------------------------------------------------- #
## OUTPUT VARIABLES: **;
## SPREAD = DAILY H-L SPREAD ESTIMATES WITH NEG ESTIMATES INCLUDED **;
## SPREAD_0 = DAILY H-L SPREAD ESTIMATES WITH NEG ESTIMATES SET TO ZERO **;
## SPREAD_MISS = DAILY H-L SPREAD ESTIMATES WITH NEG ESTIMATES SET TO MISSING **;
## SIGMA = DAILY STD. DEV. ESTIMATE WITH NEG ESTIMATES INCLUDED **;
## SIGMA_0 = DAILY STD. DEV. ESTIMATE WITH NEG ESTIMATES SET TO ZERO
HLSPRD_DAY_SAMPLE <- sample[, colnames(sample) %in% c("IQ_ISSUER_CIQID", "DATE", "SPREAD", "SPREAD_0", "SPREAD_MISS", "SIGMA", "SIGMA_0")]
##### 2.5 Calculate monthly and yearly high-low spread estimates #####
# ------------------------------------------------------------------ #
HLSPRD_DAY_SAMPLE$DATE1 <- as.Date(HLSPRD_DAY_SAMPLE$DATE, "%Y-%m-%d")
HLSPRD_DAY_SAMPLE$YEAR = as.numeric(format(HLSPRD_DAY_SAMPLE$DATE1, "%Y"))
HLSPRD_DAY_SAMPLE$MONTH = as.numeric(format(HLSPRD_DAY_SAMPLE$DATE1, "%m"))
save.image(file = "bid_ask.RData")
##### 2.5.1 Monthly average
# calculate # of days with trading activity per month
HLSPRD_DAY_SAMPLE <- HLSPRD_DAY_SAMPLE %>%
group_by(IQ_ISSUER_CIQID, YEAR, MONTH) %>%
mutate(TRADES_PER_MONTH = length(unique(SPREAD)))
# set all days within month on first of month for easier plotting later
HLSPRD_DAY_SAMPLE$DATE_MONTH <- paste("01-", strftime(as.POSIXct(HLSPRD_DAY_SAMPLE$DATE1), "%m-%Y"), sep = "")
HLSPRD_DAY_SAMPLE$DATE_MONTH <- as.Date(HLSPRD_DAY_SAMPLE$DATE_MONTH, "%d-%m-%Y")
# store all entries where # of trades per month < 12
low_trades_month <- subset(HLSPRD_DAY_SAMPLE, HLSPRD_DAY_SAMPLE$TRADES_PER_MONTH < 12)
# 56 % of all months have >= 12 active trading days
length(HLSPRD_DAY_SAMPLE$IQ_ISSUER_CIQID[HLSPRD_DAY_SAMPLE$TRADES_PER_MONTH >= 12]) / length(HLSPRD_DAY_SAMPLE$IQ_ISSUER_CIQID)
HLSPRD_DAY_SAMPLE_CLEANED <- subset(HLSPRD_DAY_SAMPLE, HLSPRD_DAY_SAMPLE$TRADES_PER_MONTH >= 12)
# calculate average monthly spreads & sigmas
spreads_monthly <- merge((aggregate(SPREAD_0 ~ IQ_ISSUER_CIQID + DATE_MONTH, HLSPRD_DAY_SAMPLE_CLEANED, FUN = mean)), (aggregate(SIGMA_0 ~ IQ_ISSUER_CIQID + DATE_MONTH, HLSPRD_DAY_SAMPLE_CLEANED, FUN = mean)))
spreads_monthly <- spreads_monthly[with(spreads_monthly, order(IQ_ISSUER_CIQID)),]
colnames(spreads_monthly) <- c("iq_issuer_ciqid", "date", "mean_spread", "mean_sigma")
# example plot
ggplot(spreads_monthly[spreads_monthly$IQ_ISSUER_CIQID == "IQ116231656",], aes(x = DATE_MONTH, y = MEAN_SPREAD_0, colour = "MEAN_SPREAD_0")) +
geom_line() +
geom_line(aes(y = MEAN_SIGMA_0, colour = "MEAN_SIGMA_0")) +
labs(x = "Date", y = "Mean sigma / spread", title = "CIQID: IQ116231656", colour = "Value")
##### 5.2 Yearly average
# calculate # of months with trading activity > 12 days
HLSPRD_DAY_SAMPLE <- HLSPRD_DAY_SAMPLE %>%
group_by(IQ_ISSUER_CIQID, YEAR) %>%
mutate(TRADES_PER_YEAR = length(unique(MONTH)))
# calculate # of active trading days per _year_
HLSPRD_DAY_SAMPLE <- HLSPRD_DAY_SAMPLE %>%
group_by(IQ_ISSUER_CIQID, YEAR) %>%
mutate(TRADES_PER_YEAR_IN_DAYS = length(unique(SPREAD)))
# store all entries where # of trades < 60
low_trades_year <- subset(HLSPRD_DAY_SAMPLE, HLSPRD_DAY_SAMPLE$TRADES_PER_YEAR_IN_DAYS < 60)
# 76 % of the iq_issuer_ciqids have 60 or more trading days per year
length(HLSPRD_DAY_SAMPLE$IQ_ISSUER_CIQID[HLSPRD_DAY_SAMPLE$TRADES_PER_YEAR_IN_DAYS >= 60]) / length(HLSPRD_DAY_SAMPLE$IQ_ISSUER_CIQID)
# set all months within year on 01.01. for easier plotting later
HLSPRD_DAY_SAMPLE$DATE_YEAR <- paste("01-01-", strftime(as.POSIXct(HLSPRD_DAY_SAMPLE$DATE1), "%Y"), sep = "")
HLSPRD_DAY_SAMPLE$DATE_YEAR <- as.Date(HLSPRD_DAY_SAMPLE$DATE_YEAR, "%d-%m-%Y")
HLSPRD_DAY_SAMPLE_CLEANED <- subset(HLSPRD_DAY_SAMPLE, HLSPRD_DAY_SAMPLE$TRADES_PER_YEAR_IN_DAYS >= 60)
# calculate average yearly spreads & sigmas
spreads_yearly <- merge((aggregate(SPREAD_0 ~ IQ_ISSUER_CIQID + DATE_YEAR, HLSPRD_DAY_SAMPLE_CLEANED, FUN = mean)), (aggregate(SIGMA_0 ~ IQ_ISSUER_CIQID + DATE_YEAR, HLSPRD_DAY_SAMPLE_CLEANED, FUN = mean)))
colnames(spreads_yearly) <- c("IQ_ISSUER_CIQID", "DATE_YEAR", "MEAN_SPREAD_0", "MEAN_SIGMA_0")
spreads_yearly <- spreads_yearly[with(spreads_yearly, order(IQ_ISSUER_CIQID)),]
colnames(spreads_yearly) <- c("iq_issuer_ciqid", "date", "mean_spread", "mean_sigma")
I now have the problem, that I get resulting spreads > 1 for some rare outliers (I'm applying the algorithm to municipal bond data in the US). Did I do a mistake? Or how do I have to interpret it?
Thanks a lot for your help!