Source code for geoxgboost.geoxgboost

""" This module implements Geographical-XGBoost for spatially local regression.

The module contains the following functions:

- `create_param_grid` - Returns the grid of up to three hyperparameters.
- `nestedCV` - Returns the optimized hyperparameters' values and generalization error of XGBoost.
- `global_xgb` - Returns the global XGBoost model.
- `optimize_bw` - Returns the optimized bandwidth value.
- `gxgb` - Returns geographical XGBoost, local prediction and related statistics.
- `predict_gxgb` - Returns prediction in unseen data.

"""
# Import libs
import numpy as np
import pandas as pd
from numpy import mean
from numpy import std
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from scipy.spatial import distance_matrix

[docs] def create_param_grid(Param1, Param1_Values, Param2=None, Param2_Values=None, Param3=None, Param3_Values=None): """ Creates a grid of up to three hyperparameters for tuning. Examples: >>> Param1='n_estimators' >>> Param1_Values = [100, 200, 300,500] >>> Param2='learning_rate' >>> Param2_Values = [0.1, 0.05,0.01] >>> Param3='max_depth' >>> Param3_Values = [2,3,4,6] >>> create_param_grid(Param1,Param1_Values,Param2,Param2_Values,Param3,Param3_Values) :param Param1: 1st hyperparameter name e.g., 'n_estimators' :param Param1_Values: values for search e.g., [100, 200, 500] :param Param2: 2nd hyperparameter name e.g., 'learning_rate'. Default=None. :param Param2_Values: values for search e.g., [0.1, 0.05,0.01]. Default=None. :param Param3: 3rd hyperparameter name e.g., 'max_depth'. Default=None. :param Param3_Values: values for search e.g., [2,3,4,6]. Default=None. :return: param_grid. Can be used in nestedCV function to fine tune hyperparameters Change the argument of Param1, Param2, or Param3 with other hyperparamters available to tune XGBoost, such as: subsample, colsample_bytree, lambda, alpha etc. For example: Param1= 'subsmample' Param1_Values = [0.5, 0.7, 0.9] A complete list of hyperparameters can be found here: https://xgboost.readthedocs.io/en/stable/parameter.html Tip: This function can be iteratively repeated with different sets of hyperparameters. See an example in GXGB_call_demo.py at the DemoGXGBoost in GitHub. """ param_grid = {} # Create params grid if Param1 is not None: param_grid[Param1] = Param1_Values if Param2 is not None: param_grid[Param2] = Param2_Values if Param3 is not None: param_grid[Param3] = Param3_Values return param_grid
[docs] def nestedCV(X, y, param_grid, Param1, Param2=None, Param3=None, params=None, path_save=False, n_OuterSplits=5, n_InnerSplits=3): """ Applies nested cross validation for tuning up to three hyperparameters and calculating model generalization error. :param X: dataframe with the independent variables values :param y: dataframe with the dependent variable values :param params: initial hyperparameter values. Type:dictionary :param param_grid: grid values - output of param_grid function :param Param1: name of 1st hyperparameter used (same as param_grid function) :param Param2: name of 2nd hyperparameter used in param_grid function. Default=None. :param Param3: name of 3rd hyperparameter used in param_grid function. Default=None. :param path_save: output folder. Default=False. :param n_OuterSplits: number of outer splits. Default=5. :param n_InnerSplits: number if inner splits Default=3. :return: optimized hyperparameters' values and generalization error of model through nestedCV """ # Default values for params for XGBoost and GXGBoost defaultparams = { 'n_estimators': 100, 'learning_rate': 0.3, 'max_depth': 6, 'min_child_weight': 1, 'gamma': 0, 'subsample': 1, 'colsample_bytree': 1, 'reg_alpha': 0, 'reg_lambda': 1, } if params is None: params = defaultparams print(params) Best_Param1 = list() Best_Param2 = list() Best_Param3 = list() count = 0 # Outer loop cv_outer = KFold(n_splits=n_OuterSplits, shuffle=True, random_state=1) outer_resultsR2 = list() outer_resultsRMSE = list() outer_resultsMAE = list() print("=================Nested CV process===========================================") # Inner loop for train_ix, test_ix in cv_outer.split(X): # split data X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :] Y_train, y_test = y[train_ix], y[test_ix] # configure the cross-validation procedure for the inner loop cv_inner = KFold(n_splits=n_InnerSplits, shuffle=True, random_state=1) # define the model model = XGBRegressor(**params) # execute search grid_search = GridSearchCV(model, param_grid, scoring="neg_root_mean_squared_error", n_jobs=-1, cv=cv_inner) grid_result = grid_search.fit(X_train, Y_train) # get the best performing model fit on the whole training set best_model = grid_result.best_estimator_ # evaluate model on the hold out dataset predictions = best_model.predict(X_test) R2 = r2_score(y_test, predictions) # (ytrue,ypredict) MAE = mean_absolute_error(y_test, predictions) # store the result outer_resultsR2.append(R2) outer_resultsRMSE.append(grid_result.best_score_) outer_resultsMAE.append(MAE) # update params based on the number of hyperparameters used if Param3 is None: if Param2 is not None: print('Tuning with 2 hyperparameters') Best_Param1.append(grid_result.best_params_[Param1]) Best_Param2.append(grid_result.best_params_[Param2]) case = 2 else: Best_Param1.append(grid_result.best_params_[Param1]) print('Tuning with 1 hyperparameter') case = 1 else: print('Tuning with 3 hyperparameters') Best_Param1.append(grid_result.best_params_[Param1]) Best_Param2.append(grid_result.best_params_[Param2]) Best_Param3.append(grid_result.best_params_[Param3]) case = 3 # print progress and results count += 1 progress = count / n_OuterSplits print('>Count=%.0f, R2=%.3f, RMSE=%.3f, MAE=%.3f,cfg=%s, Progress Completed=%.2f%%' % ( count, R2, grid_result.best_score_, MAE, grid_result.best_params_, progress * 100)) # End of Inner loop # Outer loop: Summarize the estimated performance of the model print("=================Nested CV results for hyperparamtre tuning==================") print('Generalization error: mean-R2 (stdev): %.3f (%.3f)' % (mean(outer_resultsR2), std(outer_resultsR2))) print('Mean MAE: %.3f (%.3f)' % (mean(outer_resultsMAE), std(outer_resultsMAE))) print('Mean RMSE: %.3f (%.3f)' % (mean(outer_resultsRMSE), std(outer_resultsRMSE))) # Find the best model index_max = np.argmax(outer_resultsRMSE) print('Best params taken at model with minimum RMSE at count: %.0f ' % (index_max + 1)) Generalized_NestedCV = pd.DataFrame(index=['Generalized Nested CV'], columns=['meanR2', 'Std_meanR2', 'meanMAE', 'Std_meanMAE', 'meanRMSE', 'meanRMSE']) Generalized_NestedCV.iloc[0, 0] = mean(outer_resultsR2) Generalized_NestedCV.iloc[0, 1] = std(outer_resultsR2) Generalized_NestedCV.iloc[0, 2] = mean(outer_resultsMAE) Generalized_NestedCV.iloc[0, 3] = std(outer_resultsMAE) Generalized_NestedCV.iloc[0, 4] = mean(outer_resultsRMSE) Generalized_NestedCV.iloc[0, 5] = std(outer_resultsRMSE) print(Generalized_NestedCV) # Outer loop generalization results export if path_save is False: Generalized_NestedCV.to_csv('Generalized_NestedCV_Stats.csv', index=True) else: Generalized_NestedCV.to_csv(path_save + 'Generalized_NestedCV_Stats.csv', index=True) # save tuned params if case == 3: TunedParams = { Param1: Best_Param1[index_max], Param2: Best_Param2[index_max], Param3: Best_Param3[index_max], } elif case == 2: TunedParams = { Param1: Best_Param1[index_max], Param2: Best_Param2[index_max], } else: TunedParams = { Param1: Best_Param1[index_max], } params.update(TunedParams) result = {} result['Params'] = params result['TunedParams'] = TunedParams result['Stats'] = Generalized_NestedCV Output_NestedCV = result print('Best hyperparamter values:') print(params) print('Results have been saved in csv format at the specified path') print("=============================================================================") return params, Output_NestedCV
[docs] def global_xgb(X, y, params, feat_importance='gain', test_size=0.33, seed=7, path_save=False): """ Calculates global XGBoost :param X: dataframe with the independent variables values :param y: dataframe with the dependent variable values :param params: hyperparameter values. Type:dictionary (NestedCV can be used to produce params) :param feat_importance: type of feature importance: 'gain',weight’,cover’,‘total gain’,‘total cover’.Default='gain' :param test_size: size test (%). Default=0.33. :param seed: seed value.Default=7 :param path_save: output folder. Default=False. :return: global xgboost performance """ eval_metric = ["mae", "rmse"] # Split data into train and test sets X_train, X_test, Y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed) model = XGBRegressor(**params, importance_type=feat_importance, eval_metric=eval_metric) eval_set = [(X_train, Y_train), (X_test, y_test)] # Train model model.fit(X_train, Y_train, eval_set=eval_set, verbose=False) # Importance importance = (pd.DataFrame(model.feature_importances_)).T importance.columns = X.columns[:] # (VarNames) # Predictions Predictions_Test = model.predict(X_test) # Make predictions for test data # evaluate prediction R2 = r2_score(y_test, Predictions_Test) # (ytrue ypredict) # Output Results for Test set (OOB) y_true = y_test y_pred = Predictions_Test R2test = r2_score(y_true, y_pred) MAEtest = mean_absolute_error(y_true, y_pred) RMSEtest = root_mean_squared_error(y_true, y_pred) # Output Results for Prediction set [We use complete X to Predict Y - expected overestimations] Predictions_Full = model.predict(X) y_true = y y_pred = Predictions_Full R2pred = r2_score(y_true, y_pred) GM_Res = y_true - y_pred MAEpred = mean_absolute_error(y_true, y_pred) RMSEpred = root_mean_squared_error(y_true, y_pred) GMDict = {'y': y_true, 'GM_yPred': y_pred, 'GM_Res': GM_Res} Global_XGB = pd.DataFrame(GMDict) # Output stats Global_XGB_Stats = pd.DataFrame(index=['GlobalXGB'], columns=['R2Pred', 'MAEPred', 'RMSPred', 'R2test', 'MAEtest', 'RMSETest']) Global_XGB_Stats.iloc[0, 0] = R2pred Global_XGB_Stats.iloc[0, 1] = MAEpred Global_XGB_Stats.iloc[0, 2] = RMSEpred Global_XGB_Stats.iloc[0, 3] = R2test Global_XGB_Stats.iloc[0, 4] = MAEtest Global_XGB_Stats.iloc[0, 5] = RMSEtest Global_XGB_Stats.reset_index(drop=True, inplace=True) importance.reset_index(drop=True, inplace=True) importance.columns = X.add_prefix('Imp_').columns[:] Global_XGB_Stats = pd.concat([Global_XGB_Stats, importance], axis=1) Global_XGB_Stats = Global_XGB_Stats.rename(index={0: 'GlobalXGB'}) print("=================XGBoost (global) evaluation results ========================") print("Global feature importance:") print(importance) print("Test R2: %.2f%%" % (R2 * 100.0)) print("===================Stats and importance======================================") print(Global_XGB_Stats) print('Results have been saved in xlsx format at the specified path') print("=============================================================================") txt1 = [ 'Pred metrics refer to the full set (after training, prediction using all data). Used only for reference. | Test metrics refer to the test set. | Imp_ refers to feature importance '] Text1 = pd.DataFrame(txt1, columns=['Note:']) txt2 = ['Grekousis G. (2025). Geographical-XGBoost: A new ensemble model for spatially local regression based on gradient-boosted trees. Journal of Geographical Systems, https://doi.org/10.1007/s10109-025-00465-4'] Text2 = pd.DataFrame(txt2, columns=['Citation:']) if path_save is False: with pd.ExcelWriter('GlobalXGB.xlsx') as writer: Global_XGB_Stats.to_excel(writer, sheet_name='Stats', index=True) Global_XGB.to_excel(writer, sheet_name='Predict', index=False) Text1.to_excel(writer, sheet_name='Stats', startrow=5, index=False) Text2.to_excel(writer, sheet_name='Stats', startrow=10, index=False) else: with pd.ExcelWriter(path_save + 'GlobalXGB.xlsx') as writer: Global_XGB_Stats.to_excel(writer, sheet_name='Stats', index=True) Global_XGB.to_excel(writer, sheet_name='Predict', index=False) Text1.to_excel(writer, sheet_name='Stats', startrow=5, index=False) Text2.to_excel(writer, sheet_name='Stats', startrow=10, index=False) result = {} result['Predictions'] = Global_XGB result['Stats'] = Global_XGB_Stats result['Importance'] = importance Output_GlobalXGBoost = result return Output_GlobalXGBoost
[docs] def optimize_bw(X, y, Coords, params, bw_min, bw_max, step=1, Kernel='Adaptive', spatial_weights=True, n_splits=3, path_save=False): """ Finds optimal bandwidth value for defining spatial kernels Examples: >>> optimize_bw(X,y, Coords, params, bw_min=30, bw_max=100,step=10) :param X: dataframe with the independent variables values :param y: dataframe with the dependent variable values :param Coords: dataframe with the coordinates of spatial units :param params: hyperparameter values :param bw_min: min bandwidth value :param bw_max: max bandwidth value :param step: incremental step. Default=1. :param Kernel: 'Adaptive' or 'Fixed' kernel type to be used. Default= 'Adaptive'. :param spatial_weights: spatial weights matrix. Default= True. :param n_splits: k-fold grid CV number of split, Default=3. :param path_save: output folder. Default=False. :return: optimal bandwidth value """ reg_Alpha = [] reg_Alpha.insert(0, params['reg_alpha']) param_grid = { # creates a pseudo grid 'reg_alpha': reg_Alpha } scoring = ['neg_root_mean_squared_error', 'r2'] print("=================Calculating optimal bandwidth===============================") if spatial_weights is False: print('Calculation without spatial weights') else: print('Calculation with spatial weights') if Kernel == 'Adaptive': print('Adaptive Kernel used') else: print('Fixed kernel ') bwR2 = [] bwMAE = [] bwRMSE = [] bwIndex = [] bwCV = [] num_rows = len(X) # number of spatial units DistanceMatrix_ij = pd.DataFrame(distance_matrix(Coords, Coords)) # Distance matrix nXn # loop for bandwidth values for b in range(bw_min, bw_max + 1, step): print('Calculating bw= %.0i, with bw_max=%.0i and step of %.0i' % (b, bw_max, step)) bw = b listIDs = [] yt = [] # ytrue value LM_yOOB = [] # save central Y oob LM_ResOOB = [] # calculate the residual of ytrue central with yOOB # loop for spatial units for i in range(num_rows): # the spatial unit to be calculated Neigbours = pd.DataFrame(DistanceMatrix_ij.iloc[:, i]) Neigbours.columns = ['Distance'] Data = pd.concat([X, y, Neigbours], axis=1) DataSorted = Data.sort_values(by=['Distance']) XcentralOOB = pd.DataFrame(DataSorted.iloc[0, : -2]) # For estimating OOB error of the central point YcentralOOB = DataSorted.iloc[0, -2] # Spatial weights if Kernel == 'Adaptive': knn = bw LocalData = DataSorted.iloc[1:knn + 1, :] # Local data without the central spatial unit (OOB). LocalX = LocalData.iloc[:, : -2] LocalY = LocalData.iloc[:, -2] # Keep only the dependent y h = max(LocalData.Distance) SpatialWeights = (1 - (LocalData.Distance.pow(2) / h ** 2)).pow( 2) # Adaptive bi-square (when we set number of nearest neighbours) else: # fixed kernel LocalData = DataSorted[DataSorted.Distance < bw] LocalData = LocalData.iloc[1:, :] # Removes central spatial unit LocalX = LocalData.iloc[:, : -2] LocalY = LocalData.iloc[:, -2] h = bw SpatialWeights = (1 - (LocalData.Distance.pow(2) / h ** 2)).pow( 2) # Fixed bi-square (when we set a distance threshold value) # train model through grid CV (used to assign spatial weights if selected) model = XGBRegressor(**params) kfold = KFold(n_splits=n_splits, shuffle=True, random_state=7) grid_search = GridSearchCV(model, param_grid, scoring=scoring, refit='neg_root_mean_squared_error', return_train_score=True, n_jobs=-1, cv=kfold) if spatial_weights is False: grid_result = grid_search.fit(LocalX, LocalY) else: grid_result = grid_search.fit(LocalX, LocalY, sample_weight=SpatialWeights) # get the best performing model fit on the whole training set best_model = grid_result.best_estimator_ # Y central OOB yOOB = best_model.predict(XcentralOOB.T) LM_yOOB.append(yOOB[0]) ResOOB = YcentralOOB - yOOB[0] LM_ResOOB.append(ResOOB) listIDs.append(i) yt.append(YcentralOOB) # end loop for spatial units LMDict = {'IDS': listIDs, 'y': yt, 'LM_yOOB': LM_yOOB, 'LM_ResOOB': LM_ResOOB} LMResults = pd.DataFrame(LMDict) y_true = LMResults.iloc[:, 1] y_pred = LMResults.iloc[:, 2] R2test = r2_score(y_true, y_pred) MAEtest = mean_absolute_error(y_true, y_pred) RMSEtest = root_mean_squared_error(y_true, y_pred) CV = ((LMResults.iloc[:, 3]).pow(2)).sum() # CV=Σ(ytrue-yoob)^2 bwR2.append(R2test) bwMAE.append(MAEtest) bwRMSE.append(RMSEtest) bwIndex.append(bw) bwCV.append(CV) print('bw= %.0i, with CV= %.3f' % (b, CV)) # end loop for bandwidth value # Save data to dataframe end csv BWDict = {'BW': bwIndex, 'R2': bwR2, 'MAE': bwMAE, 'RMSE': bwRMSE, 'CV': bwCV} BW_results = pd.DataFrame(BWDict) CVmin = min(BW_results.CV) idx = BW_results[['CV']].idxmin() BW_optCV = (BW_results.iloc[idx, 0]).values[0] print('Best bandwidth value: %i at min CV= %.3f.' % (BW_optCV, CVmin)) print('Results have been saved in csv format at the specified path') print("=============================================================================") if path_save is False: BW_results.to_csv('BW_results.csv', index=True) else: BW_results.to_csv(path_save + 'BW_results.csv', index=True) return BW_optCV
[docs] def gxgb(X, y, Coords, params, bw, Kernel='Adaptive', spatial_weights=False, feat_importance='gain', alpha_wt_type='varying', alpha_wt=1, test_size=0.30, seed=7, n_splits=5, path_save=False): """ Implements GeoXGBoost :param X: dataframe with the independent variables values :param y: dataframe with the dependent variable values :param Coords: dataframe with the coordinates of spatial units :param params: hyperparameter values :param bw: bandwidth value :param Kernel: 'Adaptive' or 'Fixed' kernel type to be used. Default= 'Adaptive'. :param spatial_weights: spatial weights matrix. Default= True. :param feat_importance: type of feature importance. Available methods: 'gain',weight’,cover’,‘total gain’,‘total cover’.Default='gain' :param alpha_wt_type: type of alpha_wt. Available methods: 'varying', fixed’. Default='varying' :param alpha_wt: aplha weight value. It takes values between 0 and 1. Default=1. :param test_size: size test (%). Default=0.33. :param seed: seed value. Default=7. :param n_splits: k-fold grid CV number of split, Default=5. :param path_save: output folder. Default=False. :return: local prediction and related statistics """ # Initialize variabes reg_Alpha = [] reg_Alpha.insert(0, params['reg_alpha']) param_grid = { # creates a pseudo grid, needed for spatial weights 'reg_alpha': reg_Alpha } listIDs = [] yt = [] # Saves the y true value LM_yPred = [] # Saves the predicted value of central when included in training LM_yOOB = [] # Saves Central Y oob LM_ResOOB = [] # Calculates the residual of y true central with yOOB LocalRsqr = [] # Save local Rsqr of test set LM_Best_params = [] # Saves best params of every local model LM_Best_score = [] # Saves best score of every local model during training phase LM_Best_importance = [] # Importance from best model bestLocalModel = [] # Best local model. Used also in PredictGXGB function y_G_hat = [] # Saves the prediction of XcentralOOB of the global model trained without the OOB y_comb = [] # Saves the combined of local yhat+ global yhat: alpha_wt * yOOB + (1-alpha_wt) * yGhat LocalAlpha_wt = [] # Saves the alpha_wt per local model # caclulate distance matrix DistanceMatrix_ij = pd.DataFrame(distance_matrix(Coords, Coords)) # Distance matrix nXn # Calculate the number of spatial units and the number of independent variable num_rows = len(X) num_cols = len(X.columns) # Scoring method scoring = ['neg_root_mean_squared_error', 'r2'] # checks if spatial_weights is False: if alpha_wt < 1: raise ValueError('alpha_wt should be 1 and alpha_wt_type should be fixed if spatial_weights is False.') print("=================Calculating Geographical XGBOOST============================") # ---------------Start loop for every local model --------- for i in range(num_rows): print('Calculating i= %.0i from a total of %.0i' % (i + 1, num_rows)) Neigbours = pd.DataFrame(DistanceMatrix_ij.iloc[:, i]) Neigbours.columns = ['Distance'] Data = pd.concat([X, y, Neigbours], axis=1) DataSorted = Data.sort_values(by=['Distance']) XcentralOOB = pd.DataFrame(DataSorted.iloc[0, : -2]) # For estimating OOB error of the cetral point YcentralOOB = DataSorted.iloc[0, -2] # Kernel weights if Kernel == 'Adaptive': knn = bw LocalData = DataSorted.iloc[1:knn + 1, :] # Local data without the central spatial unit (OOB). LocalX = LocalData.iloc[:, : -2] LocalY = LocalData.iloc[:, -2] # Keep only the dependent y # For predict results. All data are used- no OOB LocalDataFull = DataSorted.iloc[:knn + 1, :] # Local data with the central sptatial unit (Pred). LocalXFull = LocalDataFull.iloc[:, : -2] LocalYFull = LocalDataFull.iloc[:, -2] # Keep only the dependent y # Calculate spatial weights: Adaptive bi-square (for number of nearest neighbours) h = max(LocalData.Distance) SpatialWeights = (1 - (LocalData.Distance.pow(2) / h ** 2)).pow(2) print('Adaptive Kernel used with %i neighbours' % knn) else: LocalData = DataSorted[DataSorted.Distance < bw] LocalData = LocalData.iloc[1:, :] # Removes central spatial unit LocalX = LocalData.iloc[:, : -2] LocalY = LocalData.iloc[:, -2] # For predict results LocalDataFull = DataSorted.iloc[:, :] # Local data with the central spatial unit (Pred). LocalXFull = LocalDataFull.iloc[:, : -2] LocalYFull = LocalDataFull.iloc[:, -2] # Keep only the dependent y # Calculate spatial weights: Fixed bi-square (for distance threshold value) h = bw SpatialWeights = (1 - (LocalData.Distance.pow(2) / h ** 2)).pow(2) print('Fixed kernel with distance value %.00f' % bw) # ------------------Model run --------------------- model = XGBRegressor(**params, importance_type=feat_importance) # Run Grid CV kfold = KFold(n_splits=n_splits, shuffle=True, random_state=7) grid_search = GridSearchCV(model, param_grid, scoring=scoring, refit='neg_root_mean_squared_error', return_train_score=True, n_jobs=-1, cv=kfold) if spatial_weights is False: print('Calculation without spatial weights') grid_result = grid_search.fit(LocalX, LocalY) else: print('Calculation with spatial weights') grid_result = grid_search.fit(LocalX, LocalY, sample_weight=SpatialWeights) # get the best performing model fit on the training set best_model = grid_result.best_estimator_ bestLocalModel.append(best_model) LM_Best_params.append(grid_result.best_params_) LM_Best_score.append(grid_result.best_score_) LM_Best_importance.append(best_model.feature_importances_) # calculate LMR2 at the test set of CV R2means = np.max(grid_result.cv_results_['mean_test_r2']) # Y central OOB yOOB = best_model.predict(XcentralOOB.T) LM_yOOB.append(yOOB[0]) ResOOB = YcentralOOB - yOOB[0] LM_ResOOB.append(ResOOB) # Local Rsq Train LocalRsqr.append(R2means) # Saves the R2 of the (i) local model listIDs.append(i) yt.append(YcentralOOB) # Y central Predict (for predict statistcs) modelFull = XGBRegressor(**params, importance_type=feat_importance) modelFull.fit(LocalXFull, LocalYFull) yCentralPred = modelFull.predict(XcentralOOB.T) LM_yPred.append(yCentralPred[0]) # ----------------------Ensemble----------------------- # Calcluates the Global model without the OOB X1 = X.drop(i) # Creates a set for the global model without the OOB y1 = y.drop(i) eval_metric = ["mae", "rmse"] # Split data into train and test sets X_train, X_test, Y_train, y_test = train_test_split(X1, y1, test_size=test_size, random_state=seed) model1 = XGBRegressor(**params, importance_type=feat_importance, eval_metric=eval_metric) eval_set = [(X_train, Y_train), (X_test, y_test)] # Train model model1.fit(X_train, Y_train, eval_set=eval_set, verbose=False) # Prediction of OOB through the global model yGhat = model1.predict(XcentralOOB.T) resG = YcentralOOB - yGhat[0] # check global and local residuals if alpha_wt_type == 'fixed': Alpha_wt = alpha_wt print('Calculation with fixed alpha_wt') if alpha_wt == 1: print('Calculation without ensemble') else: print('Calculation with ensemble') else: if alpha_wt == 1: print('Calculation without ensemble') Alpha_wt = alpha_wt else: print('Calculation with ensemble and varying alpha_wt') if abs(resG) > abs(ResOOB): Alpha_wt = 1 else: Alpha_wt = alpha_wt b = 1 - Alpha_wt # controls the weight of local vs global model. [Alpha_wt] refers to the local model print('a= %.2f, b=%.2f' % (Alpha_wt, b)) ycomb = Alpha_wt * yOOB + b * yGhat # Combines local and global predictions y_G_hat.append(yGhat[0]) # Prediction of OOB through the global model y_comb.append(ycomb[0]) LocalAlpha_wt.append(Alpha_wt) # ---------------End loop------------------------------------------------- # ------------Local models and local fearure importance ----------------- LMDict = {'IDS': listIDs, 'y': yt, 'LM_yPred': LM_yPred, 'LM_yOOB': LM_yOOB, 'LM_ResOOB': LM_ResOOB, 'LMRsqr': LocalRsqr, 'LM_Best_score(RMSE)': LM_Best_score, 'alpha_wt': LocalAlpha_wt, 'yGhat': y_G_hat, 'y_ensemble': y_comb} LM_results = pd.DataFrame(LMDict) LM_importance = pd.DataFrame(LM_Best_importance) LM_importance.columns = X.add_prefix('Imp_').columns[:] # finding the feature with max importance. Used to map later to GIS maxValues = LM_importance.max(axis=1) MaxValues = pd.DataFrame(maxValues, columns=['MaxImportance']) # finds the column position (feature) column_id = LM_importance.columns.get_indexer(LM_importance.apply('idxmax', axis=1)) + 1 Column_id = pd.DataFrame(column_id, columns=['MaxFeatureID']) LM_importance = pd.concat([LM_importance, MaxValues, Column_id], axis=1) LM_results2Excel = pd.concat([LM_results, LM_importance], axis=1) # ----Output results of aggregated local models to compare with global set --------------- # Output Results for Predict set (only used for reference) y_true = LM_results.iloc[:, 1] y_pred = LM_results.iloc[:, 2] R2pred = r2_score(y_true, y_pred) MAEpred = mean_absolute_error(y_true, y_pred) RMSEpred = root_mean_squared_error(y_true, y_pred) # Output Results for OOB set (Local Model yOOB) y_true = LM_results.iloc[:, 1] y_pred = LM_results.iloc[:, 3] R2test = r2_score(y_true, y_pred) MAEtest = mean_absolute_error(y_true, y_pred) RMSEtest = root_mean_squared_error(y_true, y_pred) # output results for OOB-1 global set (global model output for OOB yGhat) y_true = LM_results.iloc[:, 1] y_pred = LM_results.iloc[:, 8] R2oobGl = r2_score(y_true, y_pred) MAEoobGl = mean_absolute_error(y_true, y_pred) RMSEoobGl = root_mean_squared_error(y_true, y_pred) # Output Results for combined set (y_ensemble) y_true = LM_results.iloc[:, 1] y_pred = LM_results.iloc[:, 9] R2ens = r2_score(y_true, y_pred) MAEens = mean_absolute_error(y_true, y_pred) RMSEens = root_mean_squared_error(y_true, y_pred) # xlsx file naming if spatial_weights is False: if alpha_wt == 1: ind = ['L_GXGB'] else: if alpha_wt == 1: ind = ['LW_GXGB'] else: ind = ['GXGB'] Evaluation_Results = pd.DataFrame(index=[ind], columns=['R2_Pred', 'MAE_Pred', 'RMS_Pred', 'R2_oob', 'MAE_oob', 'RMS_oob', 'R2oobGl', 'MAEoobGl', 'RMSEoobGl', 'R2ens', 'MAEens', 'RMSEens']) Evaluation_Results.iloc[0, 0] = R2pred Evaluation_Results.iloc[0, 1] = MAEpred Evaluation_Results.iloc[0, 2] = RMSEpred Evaluation_Results.iloc[0, 3] = R2test Evaluation_Results.iloc[0, 4] = MAEtest Evaluation_Results.iloc[0, 5] = RMSEtest if alpha_wt == 1: pass else: Evaluation_Results.iloc[0, 6] = R2oobGl Evaluation_Results.iloc[0, 7] = MAEoobGl Evaluation_Results.iloc[0, 8] = RMSEoobGl Evaluation_Results.iloc[0, 9] = R2ens Evaluation_Results.iloc[0, 10] = MAEens Evaluation_Results.iloc[0, 11] = RMSEens print("=============Geographical-XGBoost Evaluation results ================") print(Evaluation_Results) print('Results have been saved in xlsx format at the specified path') print("=====================================================================") # saving hyperparameters values params_spatial = {'Spatial Units': num_rows, 'Features': num_cols, 'Kernel': Kernel, 'Bandwidth': bw, 'Spatial Weights': spatial_weights, 'Alpha Weight type': alpha_wt_type, 'Alpha Weight value': alpha_wt, 'Feature Importance': feat_importance, 'Test Size': test_size, 'Seed': seed} params_full = params | params_spatial Params_full = (pd.DataFrame(data=params_full, index=['Value'])).T file_name = (ind[0] + '.xlsx') txt1 = [ 'Pred metrics refer to the full set (after training, prediction using all data). Used only for reference. | oob metrics refer to the local model when the central point is not including. | oobGl metrics refer to the global model when the central point is not including. | ens metrics refer to the local model'] Text1 = pd.DataFrame(txt1, columns=['Notes:']) txt3 = ['Grekousis G. (2025). Geographical-XGBoost: A new ensemble model for spatially local regression based on gradient-boosted trees. Journal of Geographical Systems, https://doi.org/10.1007/s10109-025-00465-4'] Text3 = pd.DataFrame(txt3, columns=['Citation:']) Xls_column_labels = {'IDS': 'Spatial unit id', 'y': 'y_true', 'LM_yPred': 'Local model prediction including central point', 'LM_yOOB': 'Local model prediction excluding central point', 'LM_ResOOB': 'Local model OOB residuals', 'LMRsqr': 'Local model Rsqr (oob)', 'LM_Best_score(RMSE)': 'Local model best score (oob)', 'alpha_wt': 'alpha weight value', 'yGhat': 'Prediction of the global model excluding central point', 'y_ensemble': 'prediction of y through ensemble', 'Imp_': 'Feature local importance', 'MaxImportance': 'Maximum feature importance', 'MaxFeatureID': 'Feature with max importance'} xls_column_labels = (pd.DataFrame(data=Xls_column_labels, index=['Description'])).T # saving hyperparameters, statistics, and local models to xlsx if path_save is False: with pd.ExcelWriter(file_name) as writer: Evaluation_Results.to_excel(writer, sheet_name='Stats', index=True) Params_full.to_excel(writer, sheet_name='Stats', startrow=3, index=True) Text1.to_excel(writer, sheet_name='Stats', startrow=25, index=False) xls_column_labels.to_excel(writer, sheet_name='Stats', startrow=28, index=True) Text3.to_excel(writer, sheet_name='Stats', startrow=43, index=False) LM_results2Excel.to_excel(writer, sheet_name='LocalModels', index=False) else: with pd.ExcelWriter(path_save + file_name) as writer: Evaluation_Results.to_excel(writer, sheet_name='Stats', index=True) Params_full.to_excel(writer, sheet_name='Stats', startrow=3, index=True) Text1.to_excel(writer, sheet_name='Stats', startrow=25, index=False) xls_column_labels.to_excel(writer, sheet_name='Stats', startrow=28, index=True) Text3.to_excel(writer, sheet_name='Stats', startrow=43, index=False) LM_results2Excel.to_excel(writer, sheet_name='LocalModels', index=False) # output results result = {} result['Params'] = Params_full result['Stats'] = Evaluation_Results result['Prediction'] = LM_results2Excel result['alpha_wt'] = LocalAlpha_wt result['y_G_hat'] = y_G_hat result['bestLocalModel'] = bestLocalModel Output_GXGB_LocalModel = result return Output_GXGB_LocalModel
[docs] def predict_gxgb(DataPredict, CoordsPredict, Coords, Output_GXGB_LocalModel, alpha_wt=0.5, alpha_wt_type='varying', path_save=False): """ Prediction in unseen data :param DataPredict: dataframe containing the values of the independent variables referring to the spatial units in which the prediction will take place. :param CoordsPredict: dataframe containing the coordinates of the spatial units in which the prediction will take place. :param Coords: dataframe of coordinates of all spatial units that the original GXGB model was trained :param Output_GXGB_LocalModel: the trained model that has been created through gxgb function :param alpha_wt: the value of alpha weight. It ranges from 0 to 1. Default=0.5 :param alpha_wt_type: type of alpha_wt. Available methods: 'varying', fixed’. Default='varying' :param path_save: output folder. Default=False. :return: prediction in unseen data. """ DistanceMatrix_Predict = pd.DataFrame(distance_matrix(CoordsPredict, Coords)) # Distance matrix nXn num_rows = len(DataPredict) index_min = np.argmin(DistanceMatrix_Predict, axis=1) Y_PRED = [] Alpha_wtDF = pd.DataFrame(Output_GXGB_LocalModel['alpha_wt']) y_G_hat = pd.DataFrame(Output_GXGB_LocalModel['y_G_hat']) bestLocalModel = Output_GXGB_LocalModel['bestLocalModel'] for i in range(num_rows): index = index_min[i] localModel = bestLocalModel[index] pred = pd.DataFrame(DataPredict.iloc[i, :]) # predict y using the local model y_predLoc = localModel.predict(pred.transpose()) # predict y using the global model yGhat = y_G_hat.iloc[index] # predict final if alpha_wt_type == 'fixed': Alpha_wt = alpha_wt print('Calculation with fixed alpha_wt') b = 1 - Alpha_wt # controls the weight of local vs global model. [Alpha_wt] refers to the local model print('a= %.2f, b=%.2f' % (Alpha_wt, b)) else: Alpha_wt = Alpha_wtDF.iloc[index] print('Calculation with varying alpha_wt') b = 1 - Alpha_wt # controls the weight of local vs global model. [Alpha_wt] refers to the local model print('a= %.2f, b=%.2f' % (Alpha_wt.iloc[0], b.iloc[0])) # Combines local and global predictions Y_pred = Alpha_wt * y_predLoc + b * yGhat Y_PRED.append(Y_pred[0]) Predict_results = pd.DataFrame(Y_PRED, columns=['Y_PRED']) if path_save is False: Predict_results.to_csv('Predict_results.csv', index=True) else: Predict_results.to_csv(path_save + 'Predict_results.csv', index=True) print("=============Predict Geographical-XGBoost ==========================") print(Predict_results) print('Results have been saved in xlsx format at the specified path') print("=====================================================================") Output_PredictGXGBoost = Predict_results return Output_PredictGXGBoost