I want to fit a normal and a pareto distribution to some data I collected. I will insert the codes for all three types of fitting below. You will notice that they are quite similar, as the concept of the fitting is the same. I inserted the data as a vector in each code.
I have three types of fitting: (1) my data and normal distribution (2) my data and pareto distribution (in which I use the smallest value in my data as the starting point, as the pareto distribution can only be positive) (3) my data and pareto distribution (in which I only use the absolute value, so that I don't have negative data)
My data has nearly the distribution of a normal distribution, so it is clear that the pareto distribution does not fit my data. However, I still have to find the best "fitting" pareto distribution for my data.
My problem is that the parameters I get in the end are partially the same ones I used in the beginning. I thought the parameters would be different after the fitting. In the beginning I set the parameters as a starting point. In (3), the values are different, but in (1) and (2) they don't.
If you have any idea, why the parameters in (1) and (2) don't change after the fitting, I would be very grateful!
This is my R-Code for (1): My data and normal distribution
rm(list = ls()) # Clear environment
cat("\014") # Clear console
#install.packages("tidyverse")
#install.packages("optimization")
#install.packages("moments")
#install.packages("ptsuite")
#install.packages("ggpubr")
library(nloptr)
library(moments)
library(tidyverse)
library(optimization)
library(ggplot2)
library(ptsuite)
library(ggpubr)
library(readxl)
# My data and normal distribution -------------------------------------------------
simulations<-5000 #Set desired size of simulated distribution for fitting
number.bins<-21 #Set number of bins for fitting
# Fix random process for reproducible results -----------------------------
set.seed(1)
calculate.normal <- function(x) {
para.cases <- x[1]
para.steps <- x[2]
para.mean <- x[3]
para.sd <- x[4]
normal.vector <- c(para.cases) #Result of the Function, e.g. Vector with 20 bins/cases
for (i in 1:para.cases) {
x<-para.mean-3.0*para.sd+((i-1)*para.steps)
normal.vector[i] <- (1/(sqrt(2*pi*para.sd^2))*exp((-1/2)*((x-para.mean)/para.sd)^2))
i <- i + para.steps
}
return(normal.vector)
}
# generate data base ------------------------------------------------------
# Your set of values
data.vector <- c(0.00000000, 0.00000000, 0.00000000, 0.57543070, -0.60346822, 0.00000000, -0.75467120, -2.87598944, -0.69492436, 0.00000000,
-0.39135436, 0.00000000, 2.08477309, -1.03078570, -0.93325051, 2.33102484, 0.00000000, -3.14602489, -4.48954942, 0.30357000,
-2.45110053, 0.00000000, 0.00000000, 0.69492436, -0.99849437, -0.69492436, 0.00000000, 1.45017758, -3.41788558, -0.39348127,
1.48431547, 3.86714022, 0.00000000, -0.96678506, 0.96678506, -1.23806373, 0.00000000, -2.45110053, 0.00000000, 0.00000000,
0.21153184, 2.45110053, 4.82666087, -1.75838793, -0.05974683, -2.35663378, -0.75892501, 0.00000000, 0.00000000, -0.78696253,
3.58676501, -1.38984873, -0.96678506, -0.69492436, 1.23806373, 0.45535501, 2.14510195, 2.45110053, 0.03016443, -0.93325051,
0.54313936, 1.69341873, 0.75525322, -0.96678506, -2.45110053, -2.29931552, -0.06400065, 0.75892501, 0.60772203, 0.21153184,
-1.58185067, -1.69584731, -0.69492436, -1.60439116, -0.30357000, 0.00000000, 4.56206459, 1.66170942, 0.00000000, -0.51143005,
-1.11857006, 5.22836941, 0.00000000, -1.60439116, 0.69492436, 0.00000000, -0.15178500, 0.05974683, 0.00000000, 0.00000000,
-1.39227730, 1.45017758)
#Create output vectors
simulation.vector<-c()
data.density<-c()
# generate frequency distributions ----------------------------------------
#generate frequency distributions. trunc and ceiling make sure that all values are in the interval (round leads to exclusion)
histogram.absolute<-hist(data.vector, breaks=seq(floor(min(data.vector)),ceiling(max(data.vector)),l=number.bins),
col='lightgrey', xlab = "Score", face= "bold",
ylab = "Fequency", main = "", font.lab = 2,
cex.lab = 1.5, xlim = c(-5, 7), ylim = c(0, 30))
axis(side=1, at=seq(-5, 6, by=1), lwd.ticks = 1)
axis(side=2, at=seq( 0, 30, by=5), lwd.ticks = 1)
# extract case numbers for calculating percentages
#we extract the counts for each bin of the actual data
cases.data<-sum(histogram.absolute$counts)
data.density.p<-histogram.absolute$counts/cases.data
#Make a Histogram of relative frequencies
barplot(data.density.p, col='blue', xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35),
las = 1, space = 0, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2,
cex.lab = 1.4, cex.axis = 0.5)
#axis(side=1, at=seq( 0, 26, by=1), lwd.ticks = 1)
#axis(side=2, lwd.ticks = 1, labels = c("hello", "0.05", "0.10", "0.15", "0.20", "0.25", "0.30", "0.35"))
view(data.density.p)
# now we have the original distribution histogram, but with relative frequencies instead of absolute numbers (counts)
#Define function
Fit.To.normal <- function(x, data.density.p) {
para.steps <- x[1]
para.mean <- x[2]
para.sd <- x[3]
daten <- data.density.p
simulated.vector <- calculate.normal(c(20, para.steps, para.mean, para.sd))
#generate a PERFECT normal distribution with given parameters (but 20 bins fixed, why?)
simulated.vector.p <- simulated.vector / sum(simulated.vector)
# obtain the relative frequencies of the above PERFECT Pareto distribution
delta <- (daten - simulated.vector.p)
#difference between real data and simulated function (data.density.p, with fixed shape and scale)
delta.sq <- delta^2
SRMSE <- sqrt(mean(delta.sq))
#view(mean(data.density.p))
#view(mean(simulated.vector.p))
barplot(data.density.p, col='blue', xlim=c(0,23), space = 0, xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35),
las = 1, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2,
cex.lab = 1.4, cex.axis = 0.5)
#axis(side=1)
barplot(simulated.vector.p, col = 'red', add=T, xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35),
las = 1, space = 0, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2,
cex.lab = 1.4, cex.axis = 0.5) #red is simulated distribution, blue is real data
barplot(data.density.p, col='blue', xlim=c(0,23), add=T, xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35),
las = 1, space = 0, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2,
cex.lab = 1.4, cex.axis = 0.5)
barplot(simulated.vector.p, col=rgb(1,0,0,0.5), add=T, xaxes = FALSE, axis.lty = 25, ylim = c(0, 0.35),
las = 1, space = 0, ylab = "relative Frequency", main = "", face= "bold", font.lab = 2,
cex.lab = 1.4, cex.axis = 0.5)
#axis(side = 1, at = seq(0,23,1)#seq_along(data.density.p)
#, tick = TRUE)
return(SRMSE)
}
view(data.density.p)
#Fitting
passung <- optim(c(1, 1, 1),
fn = Fit.To.normal,
data.density.p = data.density.p,
method = "L-BFGS-B",
lower = c(1, 0.1, 0.1),
upper = c(10, 10, 10))
print(passung) >
This is my R-Code for (2): My data and pareto distribution
rm(list = ls()) # Clear environment
cat("\014") # Clear console
#install.packages("tidyverse")
#install.packages("optimization")
#install.packages("moments")
#install.packages("ptsuite")
#install.packages("ggpubr")
#install.packages("patternplot")
library(nloptr)
library(moments)
library(tidyverse)
library(optimization)
library(ggplot2)
library(patternplot)
library(ptsuite)
library(ggpubr)
library(readxl)
# my data and pareto distribution -------------------------------------------------------------
data.vector <- c(0.00000000, 0.00000000, 0.00000000, 0.57543070, -0.60346822, 0.00000000, -0.75467120, -2.87598944, -0.69492436, 0.00000000,
-0.39135436, 0.00000000, 2.08477309, -1.03078570, -0.93325051, 2.33102484, 0.00000000, -3.14602489, -4.48954942, 0.30357000,
-2.45110053, 0.00000000, 0.00000000, 0.69492436, -0.99849437, -0.69492436, 0.00000000, 1.45017758, -3.41788558, -0.39348127,
1.48431547, 3.86714022, 0.00000000, -0.96678506, 0.96678506, -1.23806373, 0.00000000, -2.45110053, 0.00000000, 0.00000000,
0.21153184, 2.45110053, 4.82666087, -1.75838793, -0.05974683, -2.35663378, -0.75892501, 0.00000000, 0.00000000, -0.78696253,
3.58676501, -1.38984873, -0.96678506, -0.69492436, 1.23806373, 0.45535501, 2.14510195, 2.45110053, 0.03016443, -0.93325051,
0.54313936, 1.69341873, 0.75525322, -0.96678506, -2.45110053, -2.29931552, -0.06400065, 0.75892501, 0.60772203, 0.21153184,
-1.58185067, -1.69584731, -0.69492436, -1.60439116, -0.30357000, 0.00000000, 4.56206459, 1.66170942, 0.00000000, -0.51143005,
-1.11857006, 5.22836941, 0.00000000, -1.60439116, 0.69492436, 0.00000000, -0.15178500, 0.05974683, 0.00000000, 0.00000000,
-1.39227730, 1.45017758)
data.pareto <- data.vector + abs(min(data.vector)) + 1
#view(data.pareto)
pareto_test(data.pareto)
# size and bin definition ----------------------------------------------------------
simulations<-5000 #Set desired size of simulated distribution for fitting
number.bins<-21 #Set number of bins for fitting
# Fix random process for reproducible results -----------------------------
#Making a "perfect" Pareto-distribution
#para.cases: how man cases
#para.steps: how fine the distribution should be
#para.shape: shape
#para.scale: xmin
#need %, not absolute frequency
#Function is protected from outer vectors
set.seed(1)
calculate.pareto <- function(x) {
para.cases <- x[1]
para.steps <- x[2]
para.shape <- x[3]
para.scale <- x[4]
pareto.vector <- c(para.cases) #Vector with 20 bins
for (i in 1:para.cases) {
x<-para.scale+((i-1)*para.steps)
pareto.vector[i] <- (para.shape * para.scale^para.shape) / (x^(para.shape+1)) #Calculating pareto value and saving it in the second column of the matrix
i <- i + para.steps
}
return(pareto.vector)
#view(calculate.pareto(c(20,20,1,10))) # gibt eine Spalte mit 20 Einträgen, die perfekt Pareto verteilt sind
}
# generate data base --------
hist(data.vector)
# standardize data vector such that the smallest value is 0 ---------------
min.data<-min(data.vector)
max.data<-max(data.vector)
data.vector.adjusted<-data.vector-min.data
hist(data.vector.adjusted)
#haben eine Pareto-Vtlg, die bei 0 anfängt und simuliert ist (nicht perfekt)
#Create output vectors ????
simulation.vector<-c()
data.density<-c()
# generate frequency distributions ----------------------------------------
#generate frequency distributions. trunc and ceiling make sure that all values are in the interval (round leads to exclusion)
histogram.info.data<-hist(data.vector.adjusted, breaks=seq(trunc(min(data.vector.adjusted)),ceiling(max(data.vector.adjusted)),l=number.bins),
col='green')
# extract case numbers for calculating percentages
cases.data<-sum(histogram.info.data$counts)
#we extract the counts for each bin of the actual data
data.density.p<-histogram.info.data$counts/cases.data
barplot(data.density.p, col='blue')
# now we have the original distribution histogram, but with relative frequencies instead of absolute numbers (counts)
#data.density is the relative distribution of a unreal pareto distribution
# Define Fitting Function -----------------------------------------------------------------
Fit.To.Pareto <- function(x, data.density.p) {
para.steps <- x[1]
para.shape <- x[2]
para.scale <- x[3]
daten <- data.density.p
# generate a PERFECT pareto distribution with given parameters ------------
simulated.vector <- calculate.pareto(c(20, para.steps, para.shape, para.scale))
#generate a PERFECT pareto distribution with given parameters
simulated.vector.p <- simulated.vector / sum(simulated.vector)
# obtain the relative frequencies of the above PERFECT Pareto distribution
# difference of simulated and real data ------------------------------------
delta <- (daten - simulated.vector.p)
#difference between real data (data.density.p, with fixed shape and scale)
#and simulated data with the fixed parameters
delta.sq <- delta^2
SRMSE <- sqrt(mean(delta.sq))
barplot(data.density.p, col='blue')
barplot(simulated.vector.p, col = 'red', add=T)
barplot(data.density.p, col='blue', add=T)
barplot(simulated.vector.p, col=rgb(1,0,0,0.5), add=T)
#axis(side = 1, at = seq_along(data.density.p) - 0.5, tick = TRUE)
return(SRMSE)
}
# Optimization Fitting -----------------------------------------------------
passung <- optim(c(1, 1, 1),
fn = Fit.To.Pareto,
data.density.p = data.density.p,
method = "L-BFGS-B",
lower = c(1, 0.01, 0.1),
upper = c(20, 10, 40)) #needs to be adjusted maybe?
print(passung)
This is my R-code for (3): pareto distribution with absolute values
library(moments)
library(tidyverse)
library(optimization)
library(ggplot2)
library(ptsuite)
library(ggpubr)
library(readxl)
library(plotly)
library(psych)
library(ggdendro)
library(dplyr)
library(Metrics)
library(jmv)
library(sqldf)
data.vector <- c(0.00000000, 0.00000000, 0.00000000, 0.57543070, -0.60346822, 0.00000000, -0.75467120, -2.87598944, -0.69492436, 0.00000000,
-0.39135436, 0.00000000, 2.08477309, -1.03078570, -0.93325051, 2.33102484, 0.00000000, -3.14602489, -4.48954942, 0.30357000,
-2.45110053, 0.00000000, 0.00000000, 0.69492436, -0.99849437, -0.69492436, 0.00000000, 1.45017758, -3.41788558, -0.39348127,
1.48431547, 3.86714022, 0.00000000, -0.96678506, 0.96678506, -1.23806373, 0.00000000, -2.45110053, 0.00000000, 0.00000000,
0.21153184, 2.45110053, 4.82666087, -1.75838793, -0.05974683, -2.35663378, -0.75892501, 0.00000000, 0.00000000, -0.78696253,
3.58676501, -1.38984873, -0.96678506, -0.69492436, 1.23806373, 0.45535501, 2.14510195, 2.45110053, 0.03016443, -0.93325051,
0.54313936, 1.69341873, 0.75525322, -0.96678506, -2.45110053, -2.29931552, -0.06400065, 0.75892501, 0.60772203, 0.21153184,
-1.58185067, -1.69584731, -0.69492436, -1.60439116, -0.30357000, 0.00000000, 4.56206459, 1.66170942, 0.00000000, -0.51143005,
-1.11857006, 5.22836941, 0.00000000, -1.60439116, 0.69492436, 0.00000000, -0.15178500, 0.05974683, 0.00000000, 0.00000000,
-1.39227730, 1.45017758)
data.vector.abs <- abs(data.vector)
# Pareto-Test -------------------------------------------------------------
data.pareto.mirror <- data.vector.abs + 1
#view(data.pareto.spiegel)
pareto_test(data.pareto.mirror)
hist(data.vector.abs, breaks = 20, xlab = "score", face= "bold",
ylab = "frequency", main = "", font.lab = 2,
cex.lab = 1.5, col = "grey", xlim = c(0, 6), ylim = c(0, 30))
axis(side=1, at=seq(0, 6, by=1), lwd.ticks = 1)
axis(side=2, at=seq( 0, 30, by=5), lwd.ticks = 1)
# Builing a distribution ------------------------------------------------------------
#from Pareto-Fitting
simulations<-5000 #Set desired size of simulated distribution for fitting
number.bins<-21 #Set number of bins for fitting
# Fix random process for reproducible results -----------------------------
#Making a "perfect" Pareto-distribution
#para.cases: how man cases
#para.steps: how fine the distribution should be
#para.shape: shape
#para.scale: xmin
#need %, not absolute frequency
#Function is protected from outer vectors
set.seed(1)
calculate.pareto <- function(x) {
para.cases <- x[1]
para.steps <- x[2]
para.shape <- x[3]
para.scale <- x[4]
pareto.vector <- c(para.cases) #Vector with 20 bins
for (i in 1:para.cases) {
x<-para.scale+((i-1)*para.steps)
pareto.vector[i] <- (para.shape * para.scale^para.shape) / (x^(para.shape+1)) # Calculating pareto value and save it in the second column of the matrix
i <- i + para.steps
}
return(pareto.vector)
}
# generate data base --------
data.vector<- data.vector.abs
# standardize data vector such that the smallest value is 0 ---------------
min.data<-min(data.vector)
max.data<-max(data.vector)
data.vector.adjusted<-data.vector-min.data
hist(data.vector.adjusted)
simulation.vector<-c()
data.density<-c()
# generate frequency distributions ----------------------------------------
#generate frequency distributions. trunc and ceiling make sure that all values are in the interval (round leads to exclusion)
histogram.info.data<-hist(data.vector.adjusted, breaks=seq(trunc(min(data.vector.adjusted)),ceiling(max(data.vector.adjusted)),l=number.bins),
col='lightgrey', xlab = "score", face= "bold",
ylab = "frequency", main = "", font.lab = 2,
cex.lab = 1.5)
#view(data.vector.adjusted)
#view(histogram.info.data)
# extract case numbers for calculating percentages
cases.data<-sum(histogram.info.data$counts)
#we extract the counts for each bin of the actual data
data.density.p<-histogram.info.data$counts/cases.data
barplot(data.density.p, col='blue')
# now we have the original distribution histogram, but with relative frequencies instead of absolute numbers (counts)
#data.density is the relative distribution of a unreal pareto distribution
# Define Fitting Function -----------------------------------------------------------------
Fit.To.Pareto <- function(x, data.density.p) {
para.steps <- x[1]
para.shape <- x[2]
para.scale <- x[3]
daten <- data.density.p
# generate a PERFECT pareto distribution with given parameters ------------
simulated.vector <- calculate.pareto(c(20, para.steps, para.shape, para.scale))
#generate a PERFECT pareto distribution with given parameters
simulated.vector.p <- simulated.vector / sum(simulated.vector)
# obtain the relative frequencies of the above PERFECT Pareto distribution
# difference of simulated and real data ------------------------------------
delta <- (daten - simulated.vector.p)
#difference between real data (data.density.p, with fixed shape and scale)
#and simulated data with the fixed parameters
delta.sq <- delta^2
SRMSE <- sqrt(mean(delta.sq))
barplot(data.density.p, col='blue')
barplot(simulated.vector.p, col = 'red', add=T)
barplot(data.density.p, col='blue', add=T)
barplot(simulated.vector.p, col=rgb(1,0,0,0.5), add=T)
return(SRMSE)
}
# Optimization Fitting -----------------------------------------------------
passung <- optim(c(1, 1, 1),
fn = Fit.To.Pareto,
data.density.p = data.density.p,
method = "L-BFGS-B",
lower = c(1, 0.1, 0.1),
upper = c(10, 10, 10)) #needs to be adjusted maybe?
print(passung)