Introduction

Several cultivars from literature within Zambia and Malawi were tested and compared to observed production information in a bid to select a dominant variety for maize yield prediction.

Reference data

Load malawi’s ministry of agriculture maize forecasts data as reference.

Load cultivars data. Basically, 12 maize cultivars were tested between 2010–2022 to

##  [1] "Year"              "Division"          "District"         
##  [4] "EPA"               "Season.start.date" "Season.end.date"  
##  [7] "Total.Area_ha"     "Yield_kg_ha"       "Production_MT"    
## [10] "Comments"          "yield_MT_ha"

Convert District and province names to upper case.

## NULL
Districts
x
BALAKA
BLANTYRE
CHIKWAWA
CHIRADZULU
CHITIPA
DEDZA

Cultivars production

Load information on different cultivars production potential as predicted by RHEA.

files <- list.files(paste0(root, "Cultivars_Selection/malawi/"), full.names = T)
f <- read.csv(files[1], stringsAsFactors =  FALSE)
df_list <- list()
for(i in 1:length(files)){
  f <- read.csv(files[i], stringsAsFactors =  FALSE)
  f$cultivar <- sub("\\_.*", "", basename(files[i]))
  temp <- sub("_v2", "", tools::file_path_sans_ext(basename(files[i])))
  f$Year <- sub(".*\\_|.csv.*", "", temp)
  df_list[[i]] <- f
}

df <- do.call(rbind, df_list)
df$cname <- toupper(df$cname)
df$gwad <- df$gwad/1000

Check missing names.

c <- sort(unique(ref$District))
c[!c %in% sort(unique(df$cname))]
## [1] "DOWA EAST"     "DOWA WEST"     "DOWAEAST"      "LILONGWE EAST"
## [5] "LILONGWE WEST" "MZIMBA NORTH"  "MZIMBA SOUTH"  "NKHATABAY"

Correct spellings

df$cname[df$cname=="DOWA EAST"] <- "DOWA EAST"
df$cname[df$cname=="DOWAEAST"]  <- "DOWA EAST"
df$cname[df$cname=="DOWAEST"]   <- "DOWA EAST"
c <- sort(unique(ref$District))
c[!c %in% sort(unique(df$cname))]
## [1] "DOWA EAST"     "DOWA WEST"     "DOWAEAST"      "LILONGWE EAST"
## [5] "LILONGWE WEST" "MZIMBA NORTH"  "MZIMBA SOUTH"  "NKHATABAY"

Merge RHEAS and observed MOA data

rh_q <- aggregate(gwad~cname+cultivar+Year, data=df, FUN = 'quantile', probs=c(0.25, 0.5, .75, .95), na.rm=T)
rh_q <- do.call(data.frame, rh_q)
names(rh_q) <- gsub("\\.","",names(rh_q))
rh_m <-aggregate(gwad~cname+cultivar+Year, data=df, FUN = mean, na.rm=T) #aggregate(df[,"gwad"], df[,c("cname","cultivar", "Year")], mean, na.rm=T)
names(rh_m)[4] <- "gwadmean"
rh <- rh_q
rh$gwadmean <- rh_m$gwadmean
names(rh)[1] <- "District"
ref.agg <-aggregate(yield_MT_ha~District+Year, data=ref, FUN = mean, na.rm=T)
dff <- merge(ref.agg,rh, by=c("District", "Year"))

Validation

RMSE

We can use the Root Mean Square Error (RMSE) and mean absolute percentage error (MAPE) to evaluate the models accuracy. RMSE is given as:

\[ \text{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^n y-\widehat{y}}, \] where \(\widehat{y}\) and \(y\) are predicted yields and observed yields respectively while n is the number of fitted points.

RRMSE

What about Relative root mean square error (RRMSE):

\[ \text{RRMSE} = \frac{\sqrt{\frac{1}{n} \sum_{i=1}^n y-\widehat{y}}}{\frac{1}{n} \sum_{i=1}^n y}, \]

According to Li et al. 2013, the performance of the model is excellent when RRMSE < 10%; good if 10% < RRMSE < 20%; fair if 20% < RRMSE < 30%; and poor if RRMSE ≥ 30%.

Compute validation measures

Here validation measures are computed for each cultivar at district level based on historically observed maize yields.

val <- data.frame()
Cultivar <- sort(unique(dff$cultivar))
dists <- sort(unique(dff$District))
for(i in 1:length(Cultivar)){
  for(j in 1:length(dists)){
    #print(paste0("cultivar",i, " in ", dists[j]))
    temp <- subset(dff, cultivar==paste0("cultivar",i) & District==dists[j])
    rmse_m <- rmse(temp$gwadmean-temp$yield_MT_ha)
    rrmse_m <- rrmse(temp$gwadmean, temp$yield_MT_ha)
    pcor_m <- cor(temp$gwadmean, temp$yield_MT_ha, method = "pearson")
    bias <- mean(temp$gwadmean-temp$yield_MT_ha)
    rmse25 <- rmse(temp$gwad25-temp$yield_MT_ha)
    rrmse25 <- rrmse(temp$gwad25, temp$yield_MT_ha)
    rmse50 <- rmse(temp$gwad50-temp$yield_MT_ha)
    rrmse50 <- rrmse(temp$gwad50, temp$yield_MT_ha)
    rmse75 <- rmse(temp$gwad75-temp$yield_MT_ha)
    rrmse75 <- rrmse(temp$gwad75, temp$yield_MT_ha)
    rmse95 <- rmse(temp$gwad95-temp$yield_MT_ha)
    rrmse95 <- rrmse(temp$gwad95, temp$yield_MT_ha)
    output <- c(i, dists[j], rmse_m, rrmse_m, pcor_m, rmse25, bias, rrmse25, rmse50, rrmse50, rmse75, rrmse75, rmse95, rrmse95)
    val <-  rbind(val, output)
  }
  
}


colnames(val) <-c("Cultivar", "District", "RMSE_mean", "RRMSE_mean", "pcor", "bias", "RMSE25", "RRMSE25", "RMSE50", "RRMSE50", "RMSE75", "RRMSE75", "RMSE95", "RRMSE95")

#knitr::kable(val,caption = "Cultivar assessement based on ensemble mean, 25%. 50%, 75% and 95% percentiles respectively.")

Find cultivars with least RMSE in each district.

vals <- data.frame()
for(i in 1:length(dists)){
  #Flter out district
  temp <- subset(val, District==dists[i])
  #Order dataframe in decreasing correlation value
  index <- order(temp$pcor, decreasing = T)
  #pick dataframe with first four highest correlation
  temp <- temp[index[1:4],]
  output <- c(temp$Cultivar[which.min(temp$RMSE_mean)], dists[i], min(temp$RMSE_mean))
  vals <-  rbind(vals, output)
}


colnames(vals) <-c("Cultivar", "District", "RMSE_mean")
vals$RMSE_mean <- as.numeric(vals$RMSE_mean)

knitr::kable(vals, caption = "Cultivar assessement based on ensemble RMSE of average yields from RHEAS ensemble.")
Cultivar assessement based on ensemble RMSE of average yields from RHEAS ensemble.
Cultivar District RMSE_mean
4 BALAKA 1.9476243
7 BLANTYRE 1.0963509
6 CHIKWAWA 0.9564860
6 CHIRADZULU 0.5628901
4 CHITIPA 0.9194430
4 DEDZA 1.0916843
6 KARONGA 1.4471756
3 KASUNGU 1.2141267
4 LIKOMA 1.7249400
4 MACHINGA 1.6977333
6 MANGOCHI 1.1892109
3 MCHINJI 1.1222119
11 MULANJE 1.3987127
1 MWANZA 1.8924553
3 MZIMBA 0.8027455
6 NENO 0.6600554
4 NKHOTAKOTA 0.3653033
6 NSANJE 1.7372848
6 NTCHEU 0.6221345
1 NTCHISI 0.6699413
1 PHALOMBE 0.8223237
4 RUMPHI 0.5433124
4 SALIMA 1.0642157
6 THYOLO 0.7453994
6 ZOMBA 0.5932956
knitr::kable(sort(unique(vals$Cultivar)), caption = "Revelevant cultivars for Malawi")
Revelevant cultivars for Malawi
x
1
11
3
4
6
7

Visualize RMSE spatial distribution.

Cultivars 4, 6 and 7 seem to show lower RMSE on mean of RHEAS ensemble yield forecasts. These cultivars corresponds to the following names.

No. Cultivar Name
1 SC627
2 SC403
3 PAN53
4 DKC8053
5 PHB30G1
6 DKC8033
7 SC627
8 ZMS 606
9 PHB 30G19
10 PHB 30B50
11 SC513
12 MRI624