""" This module implements Geographical-XGBoost for spatially local regression.
The module contains the following functions:
- `create_param_grid` - Returns the grid of up to three hyperparameters.
- `nestedCV` - Returns the optimized hyperparameters' values and generalization error of XGBoost.
- `global_xgb` - Returns the global XGBoost model.
- `optimize_bw` - Returns the optimized bandwidth value.
- `gxgb` - Returns geographical XGBoost, local prediction and related statistics.
- `predict_gxgb` - Returns prediction in unseen data.
"""
# Import libs
import numpy as np
import pandas as pd
from numpy import mean
from numpy import std
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from scipy.spatial import distance_matrix
[docs]
def create_param_grid(Param1, Param1_Values, Param2=None, Param2_Values=None, Param3=None, Param3_Values=None):
""" Creates a grid of up to three hyperparameters for tuning.
Examples:
>>> Param1='n_estimators'
>>> Param1_Values = [100, 200, 300,500]
>>> Param2='learning_rate'
>>> Param2_Values = [0.1, 0.05,0.01]
>>> Param3='max_depth'
>>> Param3_Values = [2,3,4,6]
>>> create_param_grid(Param1,Param1_Values,Param2,Param2_Values,Param3,Param3_Values)
:param Param1: 1st hyperparameter name e.g., 'n_estimators'
:param Param1_Values: values for search e.g., [100, 200, 500]
:param Param2: 2nd hyperparameter name e.g., 'learning_rate'. Default=None.
:param Param2_Values: values for search e.g., [0.1, 0.05,0.01]. Default=None.
:param Param3: 3rd hyperparameter name e.g., 'max_depth'. Default=None.
:param Param3_Values: values for search e.g., [2,3,4,6]. Default=None.
:return: param_grid. Can be used in nestedCV function to fine tune hyperparameters
Change the argument of Param1, Param2, or Param3 with other hyperparamters available to tune XGBoost, such as:
subsample, colsample_bytree, lambda, alpha etc.
For example:
Param1= 'subsmample'
Param1_Values = [0.5, 0.7, 0.9]
A complete list of hyperparameters can be found here:
https://xgboost.readthedocs.io/en/stable/parameter.html
Tip: This function can be iteratively repeated with different sets of hyperparameters.
See an example in GXGB_call_demo.py at the DemoGXGBoost in GitHub.
"""
param_grid = {} # Create params grid
if Param1 is not None:
param_grid[Param1] = Param1_Values
if Param2 is not None:
param_grid[Param2] = Param2_Values
if Param3 is not None:
param_grid[Param3] = Param3_Values
return param_grid
[docs]
def nestedCV(X, y, param_grid, Param1, Param2=None, Param3=None, params=None, path_save=False, n_OuterSplits=5,
n_InnerSplits=3):
""" Applies nested cross validation for tuning up to three hyperparameters and calculating model generalization error.
:param X: dataframe with the independent variables values
:param y: dataframe with the dependent variable values
:param params: initial hyperparameter values. Type:dictionary
:param param_grid: grid values - output of param_grid function
:param Param1: name of 1st hyperparameter used (same as param_grid function)
:param Param2: name of 2nd hyperparameter used in param_grid function. Default=None.
:param Param3: name of 3rd hyperparameter used in param_grid function. Default=None.
:param path_save: output folder. Default=False.
:param n_OuterSplits: number of outer splits. Default=5.
:param n_InnerSplits: number if inner splits Default=3.
:return: optimized hyperparameters' values and generalization error of model through nestedCV
"""
# Default values for params for XGBoost and GXGBoost
defaultparams = {
'n_estimators': 100,
'learning_rate': 0.3,
'max_depth': 6,
'min_child_weight': 1,
'gamma': 0,
'subsample': 1,
'colsample_bytree': 1,
'reg_alpha': 0,
'reg_lambda': 1,
}
if params is None:
params = defaultparams
print(params)
Best_Param1 = list()
Best_Param2 = list()
Best_Param3 = list()
count = 0
# Outer loop
cv_outer = KFold(n_splits=n_OuterSplits, shuffle=True, random_state=1)
outer_resultsR2 = list()
outer_resultsRMSE = list()
outer_resultsMAE = list()
print("=================Nested CV process===========================================")
# Inner loop
for train_ix, test_ix in cv_outer.split(X):
# split data
X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
Y_train, y_test = y[train_ix], y[test_ix]
# configure the cross-validation procedure for the inner loop
cv_inner = KFold(n_splits=n_InnerSplits, shuffle=True, random_state=1)
# define the model
model = XGBRegressor(**params)
# execute search
grid_search = GridSearchCV(model, param_grid, scoring="neg_root_mean_squared_error", n_jobs=-1, cv=cv_inner)
grid_result = grid_search.fit(X_train, Y_train)
# get the best performing model fit on the whole training set
best_model = grid_result.best_estimator_
# evaluate model on the hold out dataset
predictions = best_model.predict(X_test)
R2 = r2_score(y_test, predictions) # (ytrue,ypredict)
MAE = mean_absolute_error(y_test, predictions)
# store the result
outer_resultsR2.append(R2)
outer_resultsRMSE.append(grid_result.best_score_)
outer_resultsMAE.append(MAE)
# update params based on the number of hyperparameters used
if Param3 is None:
if Param2 is not None:
print('Tuning with 2 hyperparameters')
Best_Param1.append(grid_result.best_params_[Param1])
Best_Param2.append(grid_result.best_params_[Param2])
case = 2
else:
Best_Param1.append(grid_result.best_params_[Param1])
print('Tuning with 1 hyperparameter')
case = 1
else:
print('Tuning with 3 hyperparameters')
Best_Param1.append(grid_result.best_params_[Param1])
Best_Param2.append(grid_result.best_params_[Param2])
Best_Param3.append(grid_result.best_params_[Param3])
case = 3
# print progress and results
count += 1
progress = count / n_OuterSplits
print('>Count=%.0f, R2=%.3f, RMSE=%.3f, MAE=%.3f,cfg=%s, Progress Completed=%.2f%%' % (
count, R2, grid_result.best_score_, MAE, grid_result.best_params_, progress * 100))
# End of Inner loop
# Outer loop: Summarize the estimated performance of the model
print("=================Nested CV results for hyperparamtre tuning==================")
print('Generalization error: mean-R2 (stdev): %.3f (%.3f)' % (mean(outer_resultsR2), std(outer_resultsR2)))
print('Mean MAE: %.3f (%.3f)' % (mean(outer_resultsMAE), std(outer_resultsMAE)))
print('Mean RMSE: %.3f (%.3f)' % (mean(outer_resultsRMSE), std(outer_resultsRMSE)))
# Find the best model
index_max = np.argmax(outer_resultsRMSE)
print('Best params taken at model with minimum RMSE at count: %.0f ' % (index_max + 1))
Generalized_NestedCV = pd.DataFrame(index=['Generalized Nested CV'],
columns=['meanR2', 'Std_meanR2', 'meanMAE', 'Std_meanMAE', 'meanRMSE',
'meanRMSE'])
Generalized_NestedCV.iloc[0, 0] = mean(outer_resultsR2)
Generalized_NestedCV.iloc[0, 1] = std(outer_resultsR2)
Generalized_NestedCV.iloc[0, 2] = mean(outer_resultsMAE)
Generalized_NestedCV.iloc[0, 3] = std(outer_resultsMAE)
Generalized_NestedCV.iloc[0, 4] = mean(outer_resultsRMSE)
Generalized_NestedCV.iloc[0, 5] = std(outer_resultsRMSE)
print(Generalized_NestedCV)
# Outer loop generalization results export
if path_save is False:
Generalized_NestedCV.to_csv('Generalized_NestedCV_Stats.csv', index=True)
else:
Generalized_NestedCV.to_csv(path_save + 'Generalized_NestedCV_Stats.csv', index=True)
# save tuned params
if case == 3:
TunedParams = {
Param1: Best_Param1[index_max],
Param2: Best_Param2[index_max],
Param3: Best_Param3[index_max],
}
elif case == 2:
TunedParams = {
Param1: Best_Param1[index_max],
Param2: Best_Param2[index_max],
}
else:
TunedParams = {
Param1: Best_Param1[index_max],
}
params.update(TunedParams)
result = {}
result['Params'] = params
result['TunedParams'] = TunedParams
result['Stats'] = Generalized_NestedCV
Output_NestedCV = result
print('Best hyperparamter values:')
print(params)
print('Results have been saved in csv format at the specified path')
print("=============================================================================")
return params, Output_NestedCV
[docs]
def global_xgb(X, y, params, feat_importance='gain', test_size=0.33, seed=7, path_save=False):
""" Calculates global XGBoost
:param X: dataframe with the independent variables values
:param y: dataframe with the dependent variable values
:param params: hyperparameter values. Type:dictionary (NestedCV can be used to produce params)
:param feat_importance: type of feature importance: 'gain',weight’,cover’,‘total gain’,‘total cover’.Default='gain'
:param test_size: size test (%). Default=0.33.
:param seed: seed value.Default=7
:param path_save: output folder. Default=False.
:return: global xgboost performance
"""
eval_metric = ["mae", "rmse"]
# Split data into train and test sets
X_train, X_test, Y_train, y_test = train_test_split(X, y, test_size=test_size,
random_state=seed)
model = XGBRegressor(**params, importance_type=feat_importance, eval_metric=eval_metric)
eval_set = [(X_train, Y_train), (X_test, y_test)]
# Train model
model.fit(X_train, Y_train, eval_set=eval_set, verbose=False)
# Importance
importance = (pd.DataFrame(model.feature_importances_)).T
importance.columns = X.columns[:] # (VarNames)
# Predictions
Predictions_Test = model.predict(X_test) # Make predictions for test data
# evaluate prediction
R2 = r2_score(y_test, Predictions_Test) # (ytrue ypredict)
# Output Results for Test set (OOB)
y_true = y_test
y_pred = Predictions_Test
R2test = r2_score(y_true, y_pred)
MAEtest = mean_absolute_error(y_true, y_pred)
RMSEtest = root_mean_squared_error(y_true, y_pred)
# Output Results for Prediction set [We use complete X to Predict Y - expected overestimations]
Predictions_Full = model.predict(X)
y_true = y
y_pred = Predictions_Full
R2pred = r2_score(y_true, y_pred)
GM_Res = y_true - y_pred
MAEpred = mean_absolute_error(y_true, y_pred)
RMSEpred = root_mean_squared_error(y_true, y_pred)
GMDict = {'y': y_true, 'GM_yPred': y_pred, 'GM_Res': GM_Res}
Global_XGB = pd.DataFrame(GMDict)
# Output stats
Global_XGB_Stats = pd.DataFrame(index=['GlobalXGB'],
columns=['R2Pred', 'MAEPred', 'RMSPred', 'R2test', 'MAEtest', 'RMSETest'])
Global_XGB_Stats.iloc[0, 0] = R2pred
Global_XGB_Stats.iloc[0, 1] = MAEpred
Global_XGB_Stats.iloc[0, 2] = RMSEpred
Global_XGB_Stats.iloc[0, 3] = R2test
Global_XGB_Stats.iloc[0, 4] = MAEtest
Global_XGB_Stats.iloc[0, 5] = RMSEtest
Global_XGB_Stats.reset_index(drop=True, inplace=True)
importance.reset_index(drop=True, inplace=True)
importance.columns = X.add_prefix('Imp_').columns[:]
Global_XGB_Stats = pd.concat([Global_XGB_Stats, importance], axis=1)
Global_XGB_Stats = Global_XGB_Stats.rename(index={0: 'GlobalXGB'})
print("=================XGBoost (global) evaluation results ========================")
print("Global feature importance:")
print(importance)
print("Test R2: %.2f%%" % (R2 * 100.0))
print("===================Stats and importance======================================")
print(Global_XGB_Stats)
print('Results have been saved in xlsx format at the specified path')
print("=============================================================================")
txt1 = [
'Pred metrics refer to the full set (after training, prediction using all data). Used only for reference. | Test metrics refer to the test set. | Imp_ refers to feature importance ']
Text1 = pd.DataFrame(txt1, columns=['Note:'])
txt2 = ['Grekousis G. (2025). Geographical-XGBoost: A new ensemble model for spatially local regression based on gradient-boosted trees. Journal of Geographical Systems, https://doi.org/10.1007/s10109-025-00465-4']
Text2 = pd.DataFrame(txt2, columns=['Citation:'])
if path_save is False:
with pd.ExcelWriter('GlobalXGB.xlsx') as writer:
Global_XGB_Stats.to_excel(writer, sheet_name='Stats', index=True)
Global_XGB.to_excel(writer, sheet_name='Predict', index=False)
Text1.to_excel(writer, sheet_name='Stats', startrow=5, index=False)
Text2.to_excel(writer, sheet_name='Stats', startrow=10, index=False)
else:
with pd.ExcelWriter(path_save + 'GlobalXGB.xlsx') as writer:
Global_XGB_Stats.to_excel(writer, sheet_name='Stats', index=True)
Global_XGB.to_excel(writer, sheet_name='Predict', index=False)
Text1.to_excel(writer, sheet_name='Stats', startrow=5, index=False)
Text2.to_excel(writer, sheet_name='Stats', startrow=10, index=False)
result = {}
result['Predictions'] = Global_XGB
result['Stats'] = Global_XGB_Stats
result['Importance'] = importance
Output_GlobalXGBoost = result
return Output_GlobalXGBoost
[docs]
def optimize_bw(X, y, Coords, params, bw_min, bw_max, step=1, Kernel='Adaptive', spatial_weights=True, n_splits=3,
path_save=False):
""" Finds optimal bandwidth value for defining spatial kernels
Examples:
>>> optimize_bw(X,y, Coords, params, bw_min=30, bw_max=100,step=10)
:param X: dataframe with the independent variables values
:param y: dataframe with the dependent variable values
:param Coords: dataframe with the coordinates of spatial units
:param params: hyperparameter values
:param bw_min: min bandwidth value
:param bw_max: max bandwidth value
:param step: incremental step. Default=1.
:param Kernel: 'Adaptive' or 'Fixed' kernel type to be used. Default= 'Adaptive'.
:param spatial_weights: spatial weights matrix. Default= True.
:param n_splits: k-fold grid CV number of split, Default=3.
:param path_save: output folder. Default=False.
:return: optimal bandwidth value
"""
reg_Alpha = []
reg_Alpha.insert(0, params['reg_alpha'])
param_grid = { # creates a pseudo grid
'reg_alpha': reg_Alpha
}
scoring = ['neg_root_mean_squared_error', 'r2']
print("=================Calculating optimal bandwidth===============================")
if spatial_weights is False:
print('Calculation without spatial weights')
else:
print('Calculation with spatial weights')
if Kernel == 'Adaptive':
print('Adaptive Kernel used')
else:
print('Fixed kernel ')
bwR2 = []
bwMAE = []
bwRMSE = []
bwIndex = []
bwCV = []
num_rows = len(X) # number of spatial units
DistanceMatrix_ij = pd.DataFrame(distance_matrix(Coords, Coords)) # Distance matrix nXn
# loop for bandwidth values
for b in range(bw_min, bw_max + 1, step):
print('Calculating bw= %.0i, with bw_max=%.0i and step of %.0i' % (b, bw_max, step))
bw = b
listIDs = []
yt = [] # ytrue value
LM_yOOB = [] # save central Y oob
LM_ResOOB = [] # calculate the residual of ytrue central with yOOB
# loop for spatial units
for i in range(num_rows): # the spatial unit to be calculated
Neigbours = pd.DataFrame(DistanceMatrix_ij.iloc[:, i])
Neigbours.columns = ['Distance']
Data = pd.concat([X, y, Neigbours], axis=1)
DataSorted = Data.sort_values(by=['Distance'])
XcentralOOB = pd.DataFrame(DataSorted.iloc[0, : -2]) # For estimating OOB error of the central point
YcentralOOB = DataSorted.iloc[0, -2]
# Spatial weights
if Kernel == 'Adaptive':
knn = bw
LocalData = DataSorted.iloc[1:knn + 1, :] # Local data without the central spatial unit (OOB).
LocalX = LocalData.iloc[:, : -2]
LocalY = LocalData.iloc[:, -2] # Keep only the dependent y
h = max(LocalData.Distance)
SpatialWeights = (1 - (LocalData.Distance.pow(2) / h ** 2)).pow(
2) # Adaptive bi-square (when we set number of nearest neighbours)
else: # fixed kernel
LocalData = DataSorted[DataSorted.Distance < bw]
LocalData = LocalData.iloc[1:, :] # Removes central spatial unit
LocalX = LocalData.iloc[:, : -2]
LocalY = LocalData.iloc[:, -2]
h = bw
SpatialWeights = (1 - (LocalData.Distance.pow(2) / h ** 2)).pow(
2) # Fixed bi-square (when we set a distance threshold value)
# train model through grid CV (used to assign spatial weights if selected)
model = XGBRegressor(**params)
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring=scoring, refit='neg_root_mean_squared_error',
return_train_score=True, n_jobs=-1, cv=kfold)
if spatial_weights is False:
grid_result = grid_search.fit(LocalX, LocalY)
else:
grid_result = grid_search.fit(LocalX, LocalY, sample_weight=SpatialWeights)
# get the best performing model fit on the whole training set
best_model = grid_result.best_estimator_
# Y central OOB
yOOB = best_model.predict(XcentralOOB.T)
LM_yOOB.append(yOOB[0])
ResOOB = YcentralOOB - yOOB[0]
LM_ResOOB.append(ResOOB)
listIDs.append(i)
yt.append(YcentralOOB)
# end loop for spatial units
LMDict = {'IDS': listIDs, 'y': yt, 'LM_yOOB': LM_yOOB,
'LM_ResOOB': LM_ResOOB}
LMResults = pd.DataFrame(LMDict)
y_true = LMResults.iloc[:, 1]
y_pred = LMResults.iloc[:, 2]
R2test = r2_score(y_true, y_pred)
MAEtest = mean_absolute_error(y_true, y_pred)
RMSEtest = root_mean_squared_error(y_true, y_pred)
CV = ((LMResults.iloc[:, 3]).pow(2)).sum() # CV=Σ(ytrue-yoob)^2
bwR2.append(R2test)
bwMAE.append(MAEtest)
bwRMSE.append(RMSEtest)
bwIndex.append(bw)
bwCV.append(CV)
print('bw= %.0i, with CV= %.3f' % (b, CV))
# end loop for bandwidth value
# Save data to dataframe end csv
BWDict = {'BW': bwIndex, 'R2': bwR2, 'MAE': bwMAE, 'RMSE': bwRMSE, 'CV': bwCV}
BW_results = pd.DataFrame(BWDict)
CVmin = min(BW_results.CV)
idx = BW_results[['CV']].idxmin()
BW_optCV = (BW_results.iloc[idx, 0]).values[0]
print('Best bandwidth value: %i at min CV= %.3f.' % (BW_optCV, CVmin))
print('Results have been saved in csv format at the specified path')
print("=============================================================================")
if path_save is False:
BW_results.to_csv('BW_results.csv', index=True)
else:
BW_results.to_csv(path_save + 'BW_results.csv', index=True)
return BW_optCV
[docs]
def gxgb(X, y, Coords, params, bw, Kernel='Adaptive', spatial_weights=False, feat_importance='gain',
alpha_wt_type='varying', alpha_wt=1, test_size=0.30, seed=7, n_splits=5, path_save=False):
""" Implements GeoXGBoost
:param X: dataframe with the independent variables values
:param y: dataframe with the dependent variable values
:param Coords: dataframe with the coordinates of spatial units
:param params: hyperparameter values
:param bw: bandwidth value
:param Kernel: 'Adaptive' or 'Fixed' kernel type to be used. Default= 'Adaptive'.
:param spatial_weights: spatial weights matrix. Default= True.
:param feat_importance: type of feature importance. Available methods: 'gain',weight’,cover’,‘total gain’,‘total cover’.Default='gain'
:param alpha_wt_type: type of alpha_wt. Available methods: 'varying', fixed’. Default='varying'
:param alpha_wt: aplha weight value. It takes values between 0 and 1. Default=1.
:param test_size: size test (%). Default=0.33.
:param seed: seed value. Default=7.
:param n_splits: k-fold grid CV number of split, Default=5.
:param path_save: output folder. Default=False.
:return: local prediction and related statistics
"""
# Initialize variabes
reg_Alpha = []
reg_Alpha.insert(0, params['reg_alpha'])
param_grid = { # creates a pseudo grid, needed for spatial weights
'reg_alpha': reg_Alpha
}
listIDs = []
yt = [] # Saves the y true value
LM_yPred = [] # Saves the predicted value of central when included in training
LM_yOOB = [] # Saves Central Y oob
LM_ResOOB = [] # Calculates the residual of y true central with yOOB
LocalRsqr = [] # Save local Rsqr of test set
LM_Best_params = [] # Saves best params of every local model
LM_Best_score = [] # Saves best score of every local model during training phase
LM_Best_importance = [] # Importance from best model
bestLocalModel = [] # Best local model. Used also in PredictGXGB function
y_G_hat = [] # Saves the prediction of XcentralOOB of the global model trained without the OOB
y_comb = [] # Saves the combined of local yhat+ global yhat: alpha_wt * yOOB + (1-alpha_wt) * yGhat
LocalAlpha_wt = [] # Saves the alpha_wt per local model
# caclulate distance matrix
DistanceMatrix_ij = pd.DataFrame(distance_matrix(Coords, Coords)) # Distance matrix nXn
# Calculate the number of spatial units and the number of independent variable
num_rows = len(X)
num_cols = len(X.columns)
# Scoring method
scoring = ['neg_root_mean_squared_error', 'r2']
# checks
if spatial_weights is False:
if alpha_wt < 1:
raise ValueError('alpha_wt should be 1 and alpha_wt_type should be fixed if spatial_weights is False.')
print("=================Calculating Geographical XGBOOST============================")
# ---------------Start loop for every local model ---------
for i in range(num_rows):
print('Calculating i= %.0i from a total of %.0i' % (i + 1, num_rows))
Neigbours = pd.DataFrame(DistanceMatrix_ij.iloc[:, i])
Neigbours.columns = ['Distance']
Data = pd.concat([X, y, Neigbours], axis=1)
DataSorted = Data.sort_values(by=['Distance'])
XcentralOOB = pd.DataFrame(DataSorted.iloc[0, : -2]) # For estimating OOB error of the cetral point
YcentralOOB = DataSorted.iloc[0, -2]
# Kernel weights
if Kernel == 'Adaptive':
knn = bw
LocalData = DataSorted.iloc[1:knn + 1, :] # Local data without the central spatial unit (OOB).
LocalX = LocalData.iloc[:, : -2]
LocalY = LocalData.iloc[:, -2] # Keep only the dependent y
# For predict results. All data are used- no OOB
LocalDataFull = DataSorted.iloc[:knn + 1, :] # Local data with the central sptatial unit (Pred).
LocalXFull = LocalDataFull.iloc[:, : -2]
LocalYFull = LocalDataFull.iloc[:, -2] # Keep only the dependent y
# Calculate spatial weights: Adaptive bi-square (for number of nearest neighbours)
h = max(LocalData.Distance)
SpatialWeights = (1 - (LocalData.Distance.pow(2) / h ** 2)).pow(2)
print('Adaptive Kernel used with %i neighbours' % knn)
else:
LocalData = DataSorted[DataSorted.Distance < bw]
LocalData = LocalData.iloc[1:, :] # Removes central spatial unit
LocalX = LocalData.iloc[:, : -2]
LocalY = LocalData.iloc[:, -2]
# For predict results
LocalDataFull = DataSorted.iloc[:, :] # Local data with the central spatial unit (Pred).
LocalXFull = LocalDataFull.iloc[:, : -2]
LocalYFull = LocalDataFull.iloc[:, -2] # Keep only the dependent y
# Calculate spatial weights: Fixed bi-square (for distance threshold value)
h = bw
SpatialWeights = (1 - (LocalData.Distance.pow(2) / h ** 2)).pow(2)
print('Fixed kernel with distance value %.00f' % bw)
# ------------------Model run ---------------------
model = XGBRegressor(**params, importance_type=feat_importance)
# Run Grid CV
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring=scoring, refit='neg_root_mean_squared_error',
return_train_score=True, n_jobs=-1, cv=kfold)
if spatial_weights is False:
print('Calculation without spatial weights')
grid_result = grid_search.fit(LocalX, LocalY)
else:
print('Calculation with spatial weights')
grid_result = grid_search.fit(LocalX, LocalY, sample_weight=SpatialWeights)
# get the best performing model fit on the training set
best_model = grid_result.best_estimator_
bestLocalModel.append(best_model)
LM_Best_params.append(grid_result.best_params_)
LM_Best_score.append(grid_result.best_score_)
LM_Best_importance.append(best_model.feature_importances_)
# calculate LMR2 at the test set of CV
R2means = np.max(grid_result.cv_results_['mean_test_r2'])
# Y central OOB
yOOB = best_model.predict(XcentralOOB.T)
LM_yOOB.append(yOOB[0])
ResOOB = YcentralOOB - yOOB[0]
LM_ResOOB.append(ResOOB)
# Local Rsq Train
LocalRsqr.append(R2means) # Saves the R2 of the (i) local model
listIDs.append(i)
yt.append(YcentralOOB)
# Y central Predict (for predict statistcs)
modelFull = XGBRegressor(**params, importance_type=feat_importance)
modelFull.fit(LocalXFull, LocalYFull)
yCentralPred = modelFull.predict(XcentralOOB.T)
LM_yPred.append(yCentralPred[0])
# ----------------------Ensemble-----------------------
# Calcluates the Global model without the OOB
X1 = X.drop(i) # Creates a set for the global model without the OOB
y1 = y.drop(i)
eval_metric = ["mae", "rmse"]
# Split data into train and test sets
X_train, X_test, Y_train, y_test = train_test_split(X1, y1, test_size=test_size,
random_state=seed)
model1 = XGBRegressor(**params, importance_type=feat_importance, eval_metric=eval_metric)
eval_set = [(X_train, Y_train), (X_test, y_test)]
# Train model
model1.fit(X_train, Y_train, eval_set=eval_set, verbose=False)
# Prediction of OOB through the global model
yGhat = model1.predict(XcentralOOB.T)
resG = YcentralOOB - yGhat[0]
# check global and local residuals
if alpha_wt_type == 'fixed':
Alpha_wt = alpha_wt
print('Calculation with fixed alpha_wt')
if alpha_wt == 1:
print('Calculation without ensemble')
else:
print('Calculation with ensemble')
else:
if alpha_wt == 1:
print('Calculation without ensemble')
Alpha_wt = alpha_wt
else:
print('Calculation with ensemble and varying alpha_wt')
if abs(resG) > abs(ResOOB):
Alpha_wt = 1
else:
Alpha_wt = alpha_wt
b = 1 - Alpha_wt # controls the weight of local vs global model. [Alpha_wt] refers to the local model
print('a= %.2f, b=%.2f' % (Alpha_wt, b))
ycomb = Alpha_wt * yOOB + b * yGhat # Combines local and global predictions
y_G_hat.append(yGhat[0]) # Prediction of OOB through the global model
y_comb.append(ycomb[0])
LocalAlpha_wt.append(Alpha_wt)
# ---------------End loop-------------------------------------------------
# ------------Local models and local fearure importance -----------------
LMDict = {'IDS': listIDs, 'y': yt, 'LM_yPred': LM_yPred, 'LM_yOOB': LM_yOOB, 'LM_ResOOB': LM_ResOOB,
'LMRsqr': LocalRsqr, 'LM_Best_score(RMSE)': LM_Best_score, 'alpha_wt': LocalAlpha_wt, 'yGhat': y_G_hat,
'y_ensemble': y_comb}
LM_results = pd.DataFrame(LMDict)
LM_importance = pd.DataFrame(LM_Best_importance)
LM_importance.columns = X.add_prefix('Imp_').columns[:]
# finding the feature with max importance. Used to map later to GIS
maxValues = LM_importance.max(axis=1)
MaxValues = pd.DataFrame(maxValues, columns=['MaxImportance'])
# finds the column position (feature)
column_id = LM_importance.columns.get_indexer(LM_importance.apply('idxmax', axis=1)) + 1
Column_id = pd.DataFrame(column_id, columns=['MaxFeatureID'])
LM_importance = pd.concat([LM_importance, MaxValues, Column_id], axis=1)
LM_results2Excel = pd.concat([LM_results, LM_importance], axis=1)
# ----Output results of aggregated local models to compare with global set ---------------
# Output Results for Predict set (only used for reference)
y_true = LM_results.iloc[:, 1]
y_pred = LM_results.iloc[:, 2]
R2pred = r2_score(y_true, y_pred)
MAEpred = mean_absolute_error(y_true, y_pred)
RMSEpred = root_mean_squared_error(y_true, y_pred)
# Output Results for OOB set (Local Model yOOB)
y_true = LM_results.iloc[:, 1]
y_pred = LM_results.iloc[:, 3]
R2test = r2_score(y_true, y_pred)
MAEtest = mean_absolute_error(y_true, y_pred)
RMSEtest = root_mean_squared_error(y_true, y_pred)
# output results for OOB-1 global set (global model output for OOB yGhat)
y_true = LM_results.iloc[:, 1]
y_pred = LM_results.iloc[:, 8]
R2oobGl = r2_score(y_true, y_pred)
MAEoobGl = mean_absolute_error(y_true, y_pred)
RMSEoobGl = root_mean_squared_error(y_true, y_pred)
# Output Results for combined set (y_ensemble)
y_true = LM_results.iloc[:, 1]
y_pred = LM_results.iloc[:, 9]
R2ens = r2_score(y_true, y_pred)
MAEens = mean_absolute_error(y_true, y_pred)
RMSEens = root_mean_squared_error(y_true, y_pred)
# xlsx file naming
if spatial_weights is False:
if alpha_wt == 1:
ind = ['L_GXGB']
else:
if alpha_wt == 1:
ind = ['LW_GXGB']
else:
ind = ['GXGB']
Evaluation_Results = pd.DataFrame(index=[ind],
columns=['R2_Pred', 'MAE_Pred', 'RMS_Pred', 'R2_oob', 'MAE_oob', 'RMS_oob',
'R2oobGl', 'MAEoobGl', 'RMSEoobGl', 'R2ens', 'MAEens', 'RMSEens'])
Evaluation_Results.iloc[0, 0] = R2pred
Evaluation_Results.iloc[0, 1] = MAEpred
Evaluation_Results.iloc[0, 2] = RMSEpred
Evaluation_Results.iloc[0, 3] = R2test
Evaluation_Results.iloc[0, 4] = MAEtest
Evaluation_Results.iloc[0, 5] = RMSEtest
if alpha_wt == 1:
pass
else:
Evaluation_Results.iloc[0, 6] = R2oobGl
Evaluation_Results.iloc[0, 7] = MAEoobGl
Evaluation_Results.iloc[0, 8] = RMSEoobGl
Evaluation_Results.iloc[0, 9] = R2ens
Evaluation_Results.iloc[0, 10] = MAEens
Evaluation_Results.iloc[0, 11] = RMSEens
print("=============Geographical-XGBoost Evaluation results ================")
print(Evaluation_Results)
print('Results have been saved in xlsx format at the specified path')
print("=====================================================================")
# saving hyperparameters values
params_spatial = {'Spatial Units': num_rows, 'Features': num_cols, 'Kernel': Kernel, 'Bandwidth': bw,
'Spatial Weights': spatial_weights, 'Alpha Weight type': alpha_wt_type,
'Alpha Weight value': alpha_wt, 'Feature Importance': feat_importance,
'Test Size': test_size, 'Seed': seed}
params_full = params | params_spatial
Params_full = (pd.DataFrame(data=params_full, index=['Value'])).T
file_name = (ind[0] + '.xlsx')
txt1 = [
'Pred metrics refer to the full set (after training, prediction using all data). Used only for reference. | oob metrics refer to the local model when the central point is not including. | oobGl metrics refer to the global model when the central point is not including. | ens metrics refer to the local model']
Text1 = pd.DataFrame(txt1, columns=['Notes:'])
txt3 = ['Grekousis G. (2025). Geographical-XGBoost: A new ensemble model for spatially local regression based on gradient-boosted trees. Journal of Geographical Systems, https://doi.org/10.1007/s10109-025-00465-4']
Text3 = pd.DataFrame(txt3, columns=['Citation:'])
Xls_column_labels = {'IDS': 'Spatial unit id', 'y': 'y_true',
'LM_yPred': 'Local model prediction including central point',
'LM_yOOB': 'Local model prediction excluding central point',
'LM_ResOOB': 'Local model OOB residuals', 'LMRsqr': 'Local model Rsqr (oob)',
'LM_Best_score(RMSE)': 'Local model best score (oob)', 'alpha_wt': 'alpha weight value',
'yGhat': 'Prediction of the global model excluding central point',
'y_ensemble': 'prediction of y through ensemble', 'Imp_': 'Feature local importance',
'MaxImportance': 'Maximum feature importance',
'MaxFeatureID': 'Feature with max importance'}
xls_column_labels = (pd.DataFrame(data=Xls_column_labels, index=['Description'])).T
# saving hyperparameters, statistics, and local models to xlsx
if path_save is False:
with pd.ExcelWriter(file_name) as writer:
Evaluation_Results.to_excel(writer, sheet_name='Stats', index=True)
Params_full.to_excel(writer, sheet_name='Stats', startrow=3, index=True)
Text1.to_excel(writer, sheet_name='Stats', startrow=25, index=False)
xls_column_labels.to_excel(writer, sheet_name='Stats', startrow=28, index=True)
Text3.to_excel(writer, sheet_name='Stats', startrow=43, index=False)
LM_results2Excel.to_excel(writer, sheet_name='LocalModels', index=False)
else:
with pd.ExcelWriter(path_save + file_name) as writer:
Evaluation_Results.to_excel(writer, sheet_name='Stats', index=True)
Params_full.to_excel(writer, sheet_name='Stats', startrow=3, index=True)
Text1.to_excel(writer, sheet_name='Stats', startrow=25, index=False)
xls_column_labels.to_excel(writer, sheet_name='Stats', startrow=28, index=True)
Text3.to_excel(writer, sheet_name='Stats', startrow=43, index=False)
LM_results2Excel.to_excel(writer, sheet_name='LocalModels', index=False)
# output results
result = {}
result['Params'] = Params_full
result['Stats'] = Evaluation_Results
result['Prediction'] = LM_results2Excel
result['alpha_wt'] = LocalAlpha_wt
result['y_G_hat'] = y_G_hat
result['bestLocalModel'] = bestLocalModel
Output_GXGB_LocalModel = result
return Output_GXGB_LocalModel
[docs]
def predict_gxgb(DataPredict, CoordsPredict, Coords, Output_GXGB_LocalModel, alpha_wt=0.5, alpha_wt_type='varying',
path_save=False):
""" Prediction in unseen data
:param DataPredict: dataframe containing the values of the independent variables referring to the spatial units in which the prediction will take place.
:param CoordsPredict: dataframe containing the coordinates of the spatial units in which the prediction will take place.
:param Coords: dataframe of coordinates of all spatial units that the original GXGB model was trained
:param Output_GXGB_LocalModel: the trained model that has been created through gxgb function
:param alpha_wt: the value of alpha weight. It ranges from 0 to 1. Default=0.5
:param alpha_wt_type: type of alpha_wt. Available methods: 'varying', fixed’. Default='varying'
:param path_save: output folder. Default=False.
:return: prediction in unseen data.
"""
DistanceMatrix_Predict = pd.DataFrame(distance_matrix(CoordsPredict, Coords)) # Distance matrix nXn
num_rows = len(DataPredict)
index_min = np.argmin(DistanceMatrix_Predict, axis=1)
Y_PRED = []
Alpha_wtDF = pd.DataFrame(Output_GXGB_LocalModel['alpha_wt'])
y_G_hat = pd.DataFrame(Output_GXGB_LocalModel['y_G_hat'])
bestLocalModel = Output_GXGB_LocalModel['bestLocalModel']
for i in range(num_rows):
index = index_min[i]
localModel = bestLocalModel[index]
pred = pd.DataFrame(DataPredict.iloc[i, :])
# predict y using the local model
y_predLoc = localModel.predict(pred.transpose())
# predict y using the global model
yGhat = y_G_hat.iloc[index]
# predict final
if alpha_wt_type == 'fixed':
Alpha_wt = alpha_wt
print('Calculation with fixed alpha_wt')
b = 1 - Alpha_wt # controls the weight of local vs global model. [Alpha_wt] refers to the local model
print('a= %.2f, b=%.2f' % (Alpha_wt, b))
else:
Alpha_wt = Alpha_wtDF.iloc[index]
print('Calculation with varying alpha_wt')
b = 1 - Alpha_wt # controls the weight of local vs global model. [Alpha_wt] refers to the local model
print('a= %.2f, b=%.2f' % (Alpha_wt.iloc[0], b.iloc[0]))
# Combines local and global predictions
Y_pred = Alpha_wt * y_predLoc + b * yGhat
Y_PRED.append(Y_pred[0])
Predict_results = pd.DataFrame(Y_PRED, columns=['Y_PRED'])
if path_save is False:
Predict_results.to_csv('Predict_results.csv', index=True)
else:
Predict_results.to_csv(path_save + 'Predict_results.csv', index=True)
print("=============Predict Geographical-XGBoost ==========================")
print(Predict_results)
print('Results have been saved in xlsx format at the specified path')
print("=====================================================================")
Output_PredictGXGBoost = Predict_results
return Output_PredictGXGBoost