#============================================================================# ##################### Velkommen til Benchmarking ########################### #============================================================================# # Version 2 - opdateret 4. september 2024 # Fjerner alle objekter fra arbejdsomraadet rm(list=ls()) # Saetter en fast tilfaeldig startvaerdi for at sikre reproducerbare resultater set.seed(23) # Indlaeser noedvendige pakker required_packages <- c("Benchmarking", "readxlsb", "frontier","ggplot2","lmtest","sandwich") for (i in required_packages){ if( !(i %in% installed.packages()) ){install.packages(i)} } library(Benchmarking) ; library(readxl) ; library(readxlsb) ; library(ggplot2) ; library(lmtest) ; library(sandwich) # Indlaeser data som baserer sig paa indberetninger for 2022 og 2023 setwd(choose.dir()) Data <- read_xlsb("Bilag 1 - Data til brug for fastlaeggelse af de individuelle effektiviseringskrav.xlsb", sheet = "Til R-koder") # Input og output i benchmarkingen Input_FROSSET <- Data$FATO_FROSSET Output_FROSSET <- cbind(Data$OPEX_KOR_FROSSET, Data$CAPEX_KOR_FROSSET) Input <- Data$FATO Output <- cbind(Data$OPEX_KOR, Data$CAPEX_KOR) #======================================================================# ################### KORRIGEREDE NETVOLUMEN ########################## #======================================================================# # OPEX-regressionen # Outliers til endelig OPEX-regression OPEX_outliers <- c("V091","V202", "V055") # "V091" - HOFOR Vand Koebenhavn A/S # "V202" - Vandfaellesskabet Nordvestsjaelland # "V055" - Frederiksberg Vand # Beregning af regressionsmodel for OPEX OPEX_Fit_out<-lm(Data$FADO_FROSSET[-which(Data$ID %in% OPEX_outliers)]/Data$OPEX_U_FROSSET[-which(Data$ID %in% OPEX_outliers)] ~Data$ALDER_FROSSET[-which(Data$ID %in% OPEX_outliers)]+Data$TAETHED_FROSSET[-which(Data$ID %in% OPEX_outliers)]) # Cook's Distance cbind(Data$NAVN[-which(Data$ID %in% OPEX_outliers)],Data$ID[-which(Data$ID %in% OPEX_outliers)],cooks.distance(OPEX_Fit_out))[order(cooks.distance(OPEX_Fit_out)),] # Endelige regressionsresultater for OPEX round(coef(OPEX_Fit_out),6) ; summary(OPEX_Fit_out) # CAPEX-regressionen # outliers til endelig CAPEX-regression CAPEX_outliers <- c("V091", "V111","V202", "V055") # "V111" - Kalundborg Overfladevand A/S # "V091" - HOFOR Vand Koebenhavn A/S # "V202" - Vandfaellesskabet Nordvestsjaelland # "V055" - Frederiksberg Vand # Beregning af regressionsmodel for CAPEX CAPEX_Fit_out<-lm(Data$INV_OMK_FROSSET[-which(Data$ID %in% CAPEX_outliers)]/Data$CAPEX_U_FROSSET[-which(Data$ID %in% CAPEX_outliers)] ~Data$ALDER_FROSSET[-which(Data$ID %in% CAPEX_outliers)]+Data$TAETHED_FROSSET[-which(Data$ID %in% CAPEX_outliers)]) # Cook's Distance cbind(Data$NAVN[-which(Data$ID %in% CAPEX_outliers)],Data$ID[-which(Data$ID %in% CAPEX_outliers)],cooks.distance(CAPEX_Fit_out))[order(cooks.distance(CAPEX_Fit_out)),] # Endelige regressionsresultater for CAPEX round(coef(CAPEX_Fit_out),6) ; summary(CAPEX_Fit_out) #======================================================================# ######################## DEA-MODELLEN ############################### #======================================================================# # Selskaber som VST konkret vurderer skal vaere outliers i aarets benchmarking uanset udfaldet af outlier tests OUT <- c("V054", "V055", "V070", "V091", "V098", "V164") Data[Data$ID %in% OUT,c("NAVN","ID")] #======================================================================# # Funktion til at beregne Mahalanobis Distance og Superefficiens calculate_outliers <- function(ID,CompanyName, Input, Output, OUT, p_value_cutoff = 0.95, Supereff = TRUE, SE_cutoff = 1.1, RTS = "crs", ORIENTATION = "in") { require("Benchmarking") require("ggplot2") # Opretter dataframe data <- na.omit(data.frame(Selskabsnavn = CompanyName, ID = ID, Input,Output)) # Beregner Mahalanobis-afstand data$distance <- mahalanobis(data[, -c(1, 2)], colMeans(data[, -c(1, 2)]), cov(data[, -c(1, 2)])) # Beregner cutoff-v?rdi i forhold til specificeret p-v?rdi cutoff <- qchisq(p = p_value_cutoff , df = ncol(data)-2) # Beregner p-v?rdier for Mahalanobis-afstande data$p_values <- pchisq(data$distance, df = ncol(data)-2, lower.tail = FALSE) # Fjerner observationer med p-v?rdi mindre end 1 - p_value_cutoff samt observationer, som pÃ¥ forhÃ¥nd vurderes til, at være outliers MahalanobisOut <- which(data$p_values <= (1 - p_value_cutoff) | data$ID %in% OUT) # I udgivelsen skal koden deles op if (Supereff) { data$SE <- sapply(1:nrow(Data),function(x){ dea(Input[x], cbind(Output[x,1],Output[x,2]), RTS = RTS, ORIENTATION = ORIENTATION, XREF = Input[-c(MahalanobisOut,x)], YREF = Output[-c(MahalanobisOut,x),])$eff }) # Fletter outliers baseret p? ID Outliers <- which(data$SE > SE_cutoff | data$ID %in% OUT) # Graf af Mahalanobis og SE Graph <- ggplot(data, aes(x = seq_along(distance), y = distance, color = SE>SE_cutoff)) + geom_point() + geom_hline(yintercept = cutoff, linetype = "dashed", color = "red") + geom_text(aes(label = ID), hjust = 1, vjust = -1.5, size = 2.5) + labs(x = "Observationer", y = "Mahalanobis Distance") + scale_color_manual(values = c("TRUE" = "red", "FALSE" = "blue"), labels = c("Non-outlier", "Outlier")) } else { # Graf af Mahalanobis Graph <- ggplot(data, aes(x = seq_along(distance), y = distance, color = distance > cutoff)) + geom_point() + geom_hline(yintercept = cutoff, linetype = "dashed", color = "red") + geom_text(aes(label = ID), hjust = 1, vjust = -1.5, size = 2.5) + labs(x = "Observationer", y = "Mahalanobis Distance") + scale_color_manual(values = c("TRUE" = "red", "FALSE" = "blue"), labels = c("Above Cutoff", "Below Cutoff")) Outliers <- which(data$distance > cutoff | data$ID == OUT) } return(list(Outliers = Outliers, Results = data,Graph = Graph,mahalanobisCutOff = cutoff)) } #======================================================================# # Beregninger statistiske outliers Outliers <- calculate_outliers(ID = Data$ID, CompanyName = Data$NAVN, Input_FROSSET, Output_FROSSET, OUT) # Identificeret statistiske outliers til brug for aarets benchmarking Data[Outliers$Outliers,c("NAVN","ID")] # Illustration af DEA-beregninger og outliers dea.plot(Output_FROSSET[-Outliers$Outliers,1]/Input_FROSSET[-Outliers$Outliers], Output_FROSSET[-Outliers$Outliers,2]/Input_FROSSET[-Outliers$Outliers], RTS="vrs",ORIENTATION="out",ylim=c(0,1),xlim=c(0,1.3)) points(Output_FROSSET[Outliers$Outliers,1]/Input_FROSSET[Outliers$Outliers], Output_FROSSET[Outliers$Outliers,2]/Input_FROSSET[Outliers$Outliers], col="red", pch=10) text(Output_FROSSET[Outliers$Outliers,1]/Input_FROSSET[Outliers$Outliers], Output_FROSSET[Outliers$Outliers,2]/Input_FROSSET[Outliers$Outliers], Data[Outliers$Outliers,"NAVN"], cex=0.6, pos=1, col="red") #======================================================================# ###################### Order-M-MODELLEN ############################# #======================================================================# #======================================================================# # Funktion til at beregne Order-M med tilhoerende lambdavaerdier OrderM <- function(ID, Navn, Input, Output, Input_FROSSET, Output_FROSSET, Outliers, B=3, SUPEREFF = "FALSE", M=round((dim(Data)[1]-length(Outliers))/3), RTS = "crs", ORIENTATION = "in"){ require("Benchmarking") require("dplyr") # Definerer printet OrderM_Eff_Scorer <- matrix(0, nrow = B, ncol = length(ID)) MyLambda <- list() for (i in 1:B){ # Vaer opmaerksom paa 'replace' er sat til TRUE M_bar <- unique(sample((1:length(ID))[-Outliers], M, replace = TRUE)) # Koerer Order-M for det valgte udsnit OrderM_Res <- dea(Input, Output, RTS = RTS, ORIENTATION = ORIENTATION, XREF = Input_FROSSET[M_bar], YREF = Output_FROSSET[M_bar,]) # Samler Order-M efficiensscorer OrderM_Eff_Scorer[i,] <- round(OrderM_Res$eff,digits = 7) #Beregning af lambda til brug for costdriveranalysen senere MyLambda[[i]] <- matrix(0,length(ID),length(ID),byrow = T) # Danner matrix MyLambda[[i]][,M_bar] <- OrderM_Res$lambda # Indsaetter lambdaer paa de rigtige pladser i matricen } OrderM_Scorer <- colMeans(OrderM_Eff_Scorer) LambdaMean <- Reduce("+",MyLambda)/B #Finder de gennemsnitlige lambdaer over alle iterationerne # Saetter OrderM-efficiensscorer over 1 ned til 1 if(SUPEREFF == "FALSE"){ OrderM_Scorer <- pmin(OrderM_Scorer,1) } return(list("Eff"=OrderM_Scorer,"LambdaMean"=LambdaMean)) cat("Gennemsnitlig effektivitet:", mean(OrderM_Scorer), "/n") } #======================================================================# OrderM_Res <- OrderM(ID=Data$ID, Navn=Data$NAVN, Input=Input, Output=Output, Input_FROSSET=Input_FROSSET, Output_FROSSET=Output_FROSSET, Outliers=Outliers$Outliers, B=10000,SUPEREFF = T) OrderM_Eff_front <- OrderM_Res$Eff OrderM_Eff <- ifelse(OrderM_Res$Eff > 1, 1, OrderM_Res$Eff) # Scorene fra Order-M bindes paa datasaettet Data$OrderM_score <- OrderM_Eff # Efficiente selskaber Data[Data$OrderM_score==1,c("NAVN","ID")] #======================================================================# #################### Cost Driver-analysen ########################### #======================================================================# #===================== Koersel af FROSSET Order-M =====================# # Koersel af Order-M igen, for at opnaa frosset lambda-vaerdier # Saetter en fast tilfaeldig startvaerdi for at sikre reproducerbare resultater set.seed(23) # Koer Order-M med frosset data OrderM_Res_FROSSET <- OrderM(ID=Data$ID, Navn=Data$NAVN, Input=Input_FROSSET, Output=Output_FROSSET, Input_FROSSET=Input_FROSSET, Output_FROSSET=Output_FROSSET, Outliers=Outliers$Outliers, B=10000,SUPEREFF = T) OrderM_Res_FROSSET_Eff <- ifelse(OrderM_Res_FROSSET$Eff > 1, 1, OrderM_Res_FROSSET$Eff) #============== Beregning af Ligning (1) - (2) i Bilag 3 ==============# # Vigtige variable der er givet paa forhaand z_u_FROSSET <- Data[,c("Boringer_FROSSET","Vandvaerker_FROSSET","Trykforoegerstationer_FROSSET","Rentvandsledning_FROSSET","Kunder_FROSSET","Generel.administration_FROSSET")] z_l_FROSSET <- sapply(c(1:ncol(z_u_FROSSET)),function(x){Data$Debiteret.vandmaende_FROSSET}) # Relative costdrivere Zstar_FROSSET <- sapply(1:ncol(z_u_FROSSET),function(x){ #Finder de relative costdrivere for hvert selskabs benchmark (vaegtet peers) rowSums(sweep(OrderM_Res_FROSSET$LambdaMean, 2, z_u_FROSSET[,x], "*"))/rowSums(sweep(OrderM_Res_FROSSET$LambdaMean, 2, z_l_FROSSET[,x], "*")) }) # Navngiver costdriverne igen colnames(Zstar_FROSSET) <- colnames(z_u_FROSSET) # Angiver differencen mellem selskabernes relative costdrivere og deres benchmark - Se ligning (2) i Bilag 3. ZDiff_FROSSET <- Zstar_FROSSET-(z_u_FROSSET/z_l_FROSSET) #===================== Funktion til Ligning (3)-(4) ==================# # Funktion til at beregne regressioner inkl. automatisk outliertest Regressioner <- function(Costdriver,CutOff){ #Funktion der laver en lineaer regression og fjerner selskaber med cooks.distance over en graensevaerdi MyLm <- lm(OrderM_Res_FROSSET_Eff ~ .,data = as.data.frame(ZDiff_FROSSET[,Costdriver])) #lineaer regression for alle selskaber Out <- NULL while(sum(cooks.distance(MyLm)>CutOff)>0){ #Tjekker om der er selskaber med hoej cooks.distance Out <- c(Out,as.numeric(names(which(cooks.distance(MyLm)>CutOff)))) #Tilfoejet selskaber med hoej cooks.distance til listen af selskaber der skal haandteres som outliers MyLm <- lm(OrderM_Res_FROSSET_Eff ~ .,data = as.data.frame(ZDiff_FROSSET[,Costdriver]),subset = c(-Out)) #Udregner lineaer regression uden outliers } return(summary(MyLm)) #Returnerer summary af den lineaere regression } #======================================================================# # Regressioner for hver enkelt cost driver - Se ligning (3) i Bilag 3. for (i in names(z_u_FROSSET)){ print(i) ; print(Regressioner(Costdriver = i,CutOff = 0.5))} # Samlet regression baseret paa de costdrivere der er signifikante hver for sig ovenfor - Se ligning (4) i Bilag 3. SignifikanteCostdrivere <- NULL for(i in 1:ncol(ZDiff_FROSSET)){ #Undersoeger hvilke regressioner ovenfor der er signifikante if(Regressioner(Costdriver = colnames(ZDiff_FROSSET)[i],CutOff = 0.5)$coefficients[2,4]<=0.05){ SignifikanteCostdrivere <- c(SignifikanteCostdrivere,colnames(ZDiff_FROSSET)[i]) } } # Tjekker om der er signifikante cost drivere SamletRegression <- Regressioner(Costdriver = SignifikanteCostdrivere, CutOff = 0.5) print(SamletRegression) #=========== Beregning af endelig cost-driver-kompensation ============# # Beregner relative costdrivere på endelig data, saa kun koefficienter til costdriver er frosset # Vigtige variable der er givet paa forhaand z_u <- Data[,c("Boringer","Vandvaerker","Trykforoegerstationer","Rentvandsledning","Kunder","Generel.administration")] z_l <- sapply(c(1:ncol(z_u)),function(x){Data$Debiteret.vandmaende}) # Relative costdrivere Zstar <- sapply(1:ncol(z_u),function(x){ #Finder de relative costdrivere for hvert selskabs benchmark (vaegtet peers) rowSums(sweep(OrderM_Res$LambdaMean, 2, z_u_FROSSET[,x], "*"))/rowSums(sweep(OrderM_Res$LambdaMean, 2, z_l_FROSSET[,x], "*")) }) # Navngiver costdriverne igen colnames(Zstar) <- colnames(z_u) # Angiver differencen mellem selskabernes relative costdrivere og deres benchmark - Se ligning (2) i Bilag 3. ZDiff <- Zstar-(z_u/z_l) # Beregner selskabernes kompensation paa baggrund af costdriveranalysen Kompensation <- rowSums(sweep(-ZDiff[,sub("_FROSSET","",SignifikanteCostdrivere)], 2, SamletRegression$coefficients[-1,1], "*")) # Selskaberne kan kun kompenseres opad, da costdriveranalysen skal betragtes som et forsigtighedshensyn Kompensation[Kompensation<0] <- 0 # Scorene efter costdriveranalysen saettes til maksimalt 1 Data$OrderM_score_Efter_CD <- ifelse(Data$OrderM_score+Kompensation>1,1,Data$OrderM_score+Kompensation) # Binder resultater paa datasaettets Data <- cbind(Data,Kompensation,OrderM_Res_FROSSET$LambdaMean) #======================================================================# ########################## Endelige scorer ############################# #======================================================================# Data[,c("NAVN","ID","OPEX_KOR","CAPEX_KOR","FATO")] Data[,c("NAVN","ID","OrderM_score","Kompensation","OrderM_score_Efter_CD")]