# geocrop-platform/plan/original_training.py
# (reference copy: 514 lines, 19 KiB, Python)
#only for reference do not change only read this original code produced highly accurate models
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import re
import subprocess
import sys
import warnings
import joblib
from scipy.signal import savgol_filter
from scipy.stats import linregress
from scipy.integrate import trapezoid
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, top_k_accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
# Suppress warnings
# Silence deprecation chatter from pandas/sklearn in long batch runs.
warnings.simplefilter(action='ignore', category=FutureWarning)
# PerformanceWarning: presumably raised by the repeated column inserts during
# feature engineering (DataFrame fragmentation) — harmless here.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings("ignore", category=UserWarning) # Helper for some sklearn versions
# Opt in to pandas' future behavior: no silent downcasting on replace/fillna.
pd.set_option('future.no_silent_downcasting', True)
# Check/Install Libraries
# Pip package name -> importable module name. NOTE: "imbalanced-learn"
# imports as "imblearn", so the original name.replace("-", "_") check could
# never succeed and pip was re-invoked on every run.
_DEFAULT_LIBS = {
    "minisom": "minisom",
    "imbalanced-learn": "imblearn",
    "xgboost": "xgboost",
    "lightgbm": "lightgbm",
    "catboost": "catboost",
}

def install_libs(libs=None):
    """Ensure required third-party packages are importable, pip-installing any that are missing.

    Args:
        libs: Optional mapping of pip package name -> importable module name.
              Defaults to the packages this script needs.
    """
    if libs is None:
        libs = _DEFAULT_LIBS
    for pip_name, module_name in libs.items():
        try:
            __import__(module_name)
        except ImportError:
            print(f"Installing {pip_name}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name])

install_libs()
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from google.colab import drive
# ==========================================
# 1. DATA LOADING
# ==========================================
def load_balanced_data(folder_path, filename="Zimbabwe_Crop_Balanced_Raw.csv"):
    """Mount Google Drive and load the balanced RAW dataset.

    The RAW (unscaled) file is required because the smoothing/phenology
    calculations downstream assume physical index units.

    Args:
        folder_path: Drive folder expected to contain the balanced CSV.
        filename: CSV file to load (parameterized; defaults to the original
            hard-coded name for backward compatibility).

    Returns:
        pandas.DataFrame with the loaded data, or None if the file is missing.
    """
    print(f"Mounting Google Drive and searching in: {folder_path}")
    drive.mount('/content/drive')
    file_path = os.path.join(folder_path, filename)
    if not os.path.exists(file_path):
        print(f"Error: '{filename}' not found. Please run the balancing step first.")
        return None
    print(f"Loading Balanced Raw Data: {file_path}")
    df = pd.read_csv(file_path)
    print(f"Data Loaded. Shape: {df.shape}")
    return df
# ==========================================
# 2. FEATURE ENGINEERING: SMOOTHING
# ==========================================
def apply_smoothing(df, indices=('ndvi', 'ndre', 'evi', 'savi', 'ci_re', 'ndwi')):
    """Add Savitzky-Golay-smoothed versions of each time-series index.

    Zeros are treated as gap/fill values: they are replaced with NaN,
    linearly interpolated across time, then smoothed row-wise. Smoothed
    series are stored in new "<date>_<idx>_smooth" columns.

    Args:
        df: Wide DataFrame with "<YYYYMMDD>_<idx>" time-series columns.
        indices: Spectral indices to smooth.

    Returns:
        (eng_df, dates): the augmented DataFrame and the sorted list of
        YYYYMMDD date strings found in the columns.
    """
    print("\n[FE] Applying Savitzky-Golay Smoothing (with 0-interpolation)...")
    date_pattern = re.compile(r"(\d{8})_")
    dates = sorted({date_pattern.search(col).group(1)
                    for col in df.columns if date_pattern.search(col)})
    eng_df = df.copy()
    polyorder = 2
    for idx in indices:
        cols = [f"{d}_{idx}" for d in dates if f"{d}_{idx}" in df.columns]
        if not cols:
            continue
        # 1. Replace 0 with NaN (0 indicates a gap/fill, not a real observation).
        gapped = df[cols].replace(0, np.nan)
        # 2. Interpolate linearly across time (axis=1) to fill gaps before smoothing.
        interp_values = gapped.interpolate(method='linear', axis=1, limit_direction='both').fillna(0).values
        # 3. savgol_filter requires an odd window_length > polyorder and
        #    <= number of samples. The original hard-coded 5 and crashed on
        #    short series; clamp to the largest valid odd window instead.
        window = min(5, len(cols))
        if window % 2 == 0:
            window -= 1
        if window > polyorder:
            smooth_values = savgol_filter(interp_values, window_length=window, polyorder=polyorder, axis=1)
        else:
            # Too few time steps to fit the polynomial: pass through interpolated values.
            smooth_values = interp_values
        eng_df[[c + "_smooth" for c in cols]] = smooth_values
    return eng_df, dates
# ==========================================
# 3. FEATURE ENGINEERING: PHENOLOGY
# ==========================================
def extract_phenology(df, dates, indices=('ndvi', 'ndre', 'evi')):
    """Derive phenology metrics (magnitude, AUC, timing, slope) per index.

    Prefers smoothed "_smooth" columns; falls back to raw columns. Indices
    with an incomplete set of columns are skipped (the original code would
    raise KeyError when the raw fallback columns were missing).

    Args:
        df: DataFrame with per-date index columns.
        dates: Sorted YYYYMMDD date strings (defines the time axis).
        indices: Spectral indices to summarize.

    Returns:
        DataFrame with added "<idx>_max/min/mean/std/amplitude/auc/
        peak_timestep/max_slope_up/max_slope_down" columns.
    """
    print("[FE] Extracting Phenology Metrics (Green-up, Peak, Senescence)...")
    eng_df = df.copy()
    for idx in indices:
        smooth_cols = [f"{d}_{idx}_smooth" for d in dates]
        raw_cols = [f"{d}_{idx}" for d in dates]
        if all(c in df.columns for c in smooth_cols):
            cols = smooth_cols
        elif all(c in df.columns for c in raw_cols):
            cols = raw_cols
        else:
            continue  # index not available for every date: skip instead of KeyError
        values = df[cols].values
        # 1. Magnitude metrics over the season.
        eng_df[f'{idx}_max'] = np.max(values, axis=1)
        eng_df[f'{idx}_min'] = np.min(values, axis=1)
        eng_df[f'{idx}_mean'] = np.mean(values, axis=1)
        eng_df[f'{idx}_std'] = np.std(values, axis=1)
        eng_df[f'{idx}_amplitude'] = eng_df[f'{idx}_max'] - eng_df[f'{idx}_min']
        # 2. Area under curve with dx=10 (assumes ~10-day intervals — TODO confirm cadence).
        eng_df[f'{idx}_auc'] = trapezoid(values, dx=10, axis=1)
        # 3. Timing: index of the seasonal peak (vectorized approximation).
        eng_df[f'{idx}_peak_timestep'] = np.argmax(values, axis=1)
        # Rates: steepest green-up (max diff) and steepest senescence (min diff).
        slopes = np.diff(values, axis=1)
        eng_df[f'{idx}_max_slope_up'] = np.max(slopes, axis=1)
        eng_df[f'{idx}_max_slope_down'] = np.min(slopes, axis=1)
    return eng_df
# ==========================================
# 4. FEATURE ENGINEERING: HARMONICS
# ==========================================
def add_harmonics(df, dates, indices=('ndvi',)):
    """Fit first/second-order Fourier coefficients to each index time series.

    Time steps are mapped onto one full cycle (0..2*pi) and the normalized
    dot products with sin/cos basis vectors are stored as new columns.
    Prefers smoothed columns; an index whose columns are incomplete is
    skipped (the original raised KeyError on a missing raw fallback).

    Args:
        df: DataFrame with per-date index columns.
        dates: Sorted YYYYMMDD date strings (defines the cycle length).
        indices: Spectral indices to fit.

    Returns:
        DataFrame with added "<idx>_harmonic{1,2}_{sin,cos}" columns.
    """
    print("[FE] Fitting Harmonic/Fourier Features (Normalized)...")
    eng_df = df.copy()
    n_dates = len(dates)
    # Map time steps onto one full cycle: t in [0, 2*pi).
    t = 2 * np.pi * np.arange(n_dates) / n_dates
    basis = {
        'harmonic1_sin': np.sin(t),
        'harmonic1_cos': np.cos(t),
        'harmonic2_sin': np.sin(2 * t),
        'harmonic2_cos': np.cos(2 * t),
    }
    for idx in indices:
        smooth_cols = [f"{d}_{idx}_smooth" for d in dates]
        raw_cols = [f"{d}_{idx}" for d in dates]
        if all(c in df.columns for c in smooth_cols):
            cols = smooth_cols  # prefer smoothed values for cleaner harmonics
        elif all(c in df.columns for c in raw_cols):
            cols = raw_cols
        else:
            continue  # index not available for every date: skip instead of KeyError
        values = df[cols].values
        # Normalized coefficients (dot product averaged over the cycle).
        for name, vec in basis.items():
            eng_df[f'{idx}_{name}'] = np.dot(values, vec) / n_dates
    return eng_df
# ==========================================
# 5. FEATURE ENGINEERING: INTERACTIONS & WINDOWS
# ==========================================
def add_interactions_and_windows(df, dates):
    """Add cross-index interaction features and seasonal-window summaries.

    Windows follow the Zimbabwe growing season (Oct-Jun) and are robust to
    the calendar-year crossover because months are matched on parsed dates.

    Args:
        df: DataFrame containing phenology columns (ndvi_max, evi_mean, ...)
            and per-date index columns.
        dates: Sorted YYYYMMDD date strings.

    Returns:
        DataFrame with interaction and "<idx>_<window>_{mean,max}" columns added.
    """
    print("[FE] Computing Multi-Index Interactions & Seasonal Windows...")
    eng_df = df.copy()
    # Interactions between summary statistics of different indices.
    if 'ndvi_max' in df.columns and 'ndre_max' in df.columns:
        eng_df['ndvi_ndre_peak_diff'] = df['ndvi_max'] - df['ndre_max']
    if 'evi_mean' in df.columns and 'ndvi_mean' in df.columns:
        # +0.001 guards against division by zero on bare/zero-NDVI pixels.
        eng_df['canopy_density_contrast'] = df['evi_mean'] / (df['ndvi_mean'] + 0.001)
    if not dates:
        # No time axis -> nothing to window (original indexed dates[0] and crashed).
        return eng_df
    # Parse dates once so month membership handles the year crossover correctly.
    dt_dates = pd.to_datetime(dates, format='%Y%m%d')
    windows = {
        'early': [10, 11, 12],  # Oct-Dec
        'peak': [1, 2, 3],      # Jan-Mar
        'late': [4, 5, 6],      # Apr-Jun
    }
    for idx in ['ndvi', 'ndwi', 'ndre']:
        # Prefer the smoothed series when it exists.
        suffix = "_smooth" if f"{dates[0]}_{idx}_smooth" in df.columns else ""
        for win_name, months in windows.items():
            relevant_dates = [d_str for d_dt, d_str in zip(dt_dates, dates) if d_dt.month in months]
            relevant_cols = [f"{d}_{idx}{suffix}" for d in relevant_dates if f"{d}_{idx}{suffix}" in df.columns]
            if relevant_cols:
                eng_df[f'{idx}_{win_name}_mean'] = df[relevant_cols].mean(axis=1)
                eng_df[f'{idx}_{win_name}_max'] = df[relevant_cols].max(axis=1)
    return eng_df
# ==========================================
# 6. FEATURE SELECTION (Helper)
# ==========================================
def get_selected_feature_names(X_train, y_train):
    """Rank features with a scout LightGBM and return those with non-zero importance.

    Runs inside the training split only, so no information from the test
    split leaks into the selection.

    Args:
        X_train: Training feature DataFrame.
        y_train: Encoded training labels.

    Returns:
        List of feature names with importance > 0, ordered by importance.
    """
    print("\n" + "=" * 40)
    print("FEATURE SELECTION (Inside Train Split)")
    print("=" * 40)
    # Fit a cheap scout model purely to obtain feature importances.
    print("Training scout LightGBM model...")
    scout = lgb.LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)
    scout.fit(X_train, y_train)
    ranking = (
        pd.DataFrame({'Feature': X_train.columns, 'Importance': scout.feature_importances_})
        .sort_values('Importance', ascending=False)
    )
    print("Top 10 Features:")
    print(ranking.head(10))
    # Drop anything the scout never used (zero importance).
    keep = ranking.loc[ranking['Importance'] > 0, 'Feature'].tolist()
    print(f"Kept {len(keep)} features (dropped {len(X_train.columns) - len(keep)} zero-importance).")
    return keep
# ==========================================
# 7. MODEL EVALUATION VISUALIZATIONS
# ==========================================
def plot_confusion_matrix_custom(y_true, y_pred, classes, model_name, folder_path, dataset_suffix=""):
    """Render, save, and show a confusion-matrix heatmap for one model run.

    The PNG is written to folder_path as "ConfMat_<model>_<suffix>.png".
    Cell counts are annotated only when the class count is small enough
    to stay readable (< 25 classes).
    """
    matrix = confusion_matrix(y_true, y_pred)
    show_counts = len(classes) < 25
    plt.figure(figsize=(14, 12))
    sns.heatmap(
        matrix,
        annot=show_counts,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
        cbar=False,
    )
    plt.title(f'Confusion Matrix: {model_name} ({dataset_suffix})', fontsize=16)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    out_png = os.path.join(folder_path, f"ConfMat_{model_name}_{dataset_suffix}.png")
    plt.savefig(out_png)
    plt.show()
    plt.close()
def plot_feature_importance_custom(model, feature_names, model_name, folder_path, dataset_suffix="", top_n=20):
    """Save and show a horizontal bar chart of a model's top feature importances.

    Silently returns for models without a feature_importances_ attribute.
    The PNG is written as "FeatImp_<model>_<suffix>.png".
    """
    if not hasattr(model, 'feature_importances_'):
        return
    ranked = (
        pd.DataFrame({'feature': feature_names, 'importance': model.feature_importances_})
        .sort_values('importance', ascending=False)
        .head(top_n)
    )
    plt.figure(figsize=(10, 8))
    sns.barplot(x='importance', y='feature', data=ranked, palette='viridis')
    plt.title(f'Top {top_n} Features: {model_name} ({dataset_suffix})', fontsize=16)
    plt.xlabel('Importance Score')
    plt.tight_layout()
    plt.savefig(os.path.join(folder_path, f"FeatImp_{model_name}_{dataset_suffix}.png"))
    plt.show()
    plt.close()
def plot_per_class_f1(y_true, y_pred, classes, model_name, folder_path, dataset_suffix=""):
    """Plot per-class F1 scores and persist the full classification report as CSV.

    Saves "ClassF1_<model>_<suffix>.png" and "Report_<model>_<suffix>.csv"
    into folder_path. zero_division=0 keeps sklearn quiet when a class has
    no predicted samples.
    """
    report = classification_report(y_true, y_pred, target_names=classes, output_dict=True, zero_division=0)
    # Per-class rows only: strip the aggregate rows before plotting.
    per_class = pd.DataFrame(report).transpose()
    per_class = per_class.drop(['accuracy', 'macro avg', 'weighted avg'], errors='ignore')
    plt.figure(figsize=(14, 6))
    sns.barplot(x=per_class.index, y=per_class['f1-score'], palette='coolwarm')
    plt.title(f'F1-Score per Class: {model_name} ({dataset_suffix})', fontsize=16)
    plt.xticks(rotation=45, ha='right')
    plt.ylim(0, 1.05)
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(folder_path, f"ClassF1_{model_name}_{dataset_suffix}.png"))
    plt.show()
    plt.close()
    # Keep the raw numbers alongside the figure for later inspection.
    per_class.to_csv(os.path.join(folder_path, f"Report_{model_name}_{dataset_suffix}.csv"))
# ==========================================
# 8. TRAINING & COMPARISON LOOP
# ==========================================
def _build_models():
    """Return the model zoo (name -> unfitted classifier) with robust configs."""
    return {
        'RandomForest': RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42, class_weight='balanced'),
        'XGBoost': xgb.XGBClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=7,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric='mlogloss',
            n_jobs=-1,
            random_state=42
        ),
        'LightGBM': lgb.LGBMClassifier(
            n_estimators=800,
            learning_rate=0.03,
            num_leaves=63,
            subsample=0.8,
            colsample_bytree=0.8,
            min_child_samples=30,
            class_weight='balanced',
            n_jobs=-1,
            random_state=42,
            verbose=-1
        ),
        'CatBoost': CatBoostClassifier(
            iterations=500,
            learning_rate=0.05,
            depth=6,
            verbose=0,
            random_seed=42,
            auto_class_weights='Balanced'
        )
    }

def train_and_compare(df, folder_path, dataset_suffix="Raw"):
    """Train four classifiers plus a soft-voting ensemble and compare them.

    Pipeline: drop spatial/junk columns, encode labels, split 80/20 with
    stratification, select features on the train split only (no leakage),
    then fit each model, save it as a PKL, and emit metrics + plots.

    Args:
        df: Engineered feature DataFrame including a 'label' column.
        folder_path: Output folder for model PKLs, plots, and reports.
        dataset_suffix: Tag ("Raw"/"Scaled") used in filenames and titles.

    Returns:
        List of per-model result dicts (accuracy, macro/weighted F1, top-k).
    """
    print("\n" + "="*40)
    print(f"MODEL TRAINING & COMPARISON ({dataset_suffix} Data)")
    print("="*40)
    # 1. Drop junk columns (spatial leakage prevention), including
    #    'is_syn' and 'system:index' explicitly.
    junk_cols = ['.geo', 'system:index', 'latitude', 'longitude', 'lat', 'lon', 'ID', 'parent_id', 'batch_id', 'is_syn']
    cols_to_drop = [c for c in junk_cols if c in df.columns]
    print(f"Dropping junk/spatial columns: {cols_to_drop}")
    df_clean = df.drop(columns=cols_to_drop)
    X = df_clean.drop(columns=['label'])
    y = df_clean['label']
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    class_names = le.classes_
    # 2. Split FIRST so feature selection cannot see the test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=42, stratify=y_enc)
    # 3. Feature selection on X_train ONLY.
    selected_features = get_selected_feature_names(X_train, y_train)
    X_train = X_train[selected_features]
    X_test = X_test[selected_features]
    print(f"Final Feature Count: {X_train.shape[1]}")
    models = _build_models()
    # Top-K constraint: k must be < n_classes, so fall back to k=1 for tiny
    # label sets. BUG FIX: k was previously assigned only inside the
    # predict_proba branch, so a model without predict_proba hit a NameError
    # at the print/results lines below; it is now computed once up front.
    k = 3 if len(class_names) > 3 else 1
    results = []
    trained_models = {}
    for name, model in models.items():
        print(f"\n--- Training {name} ({dataset_suffix}) ---")
        model.fit(X_train, y_train)
        # Probabilities are needed for the Top-K metric.
        if hasattr(model, "predict_proba"):
            probs = model.predict_proba(X_test)
            preds = np.argmax(probs, axis=1)
            top_k_acc = top_k_accuracy_score(y_test, probs, k=k)
        else:
            preds = model.predict(X_test)
            top_k_acc = 0.0
        acc = accuracy_score(y_test, preds)
        f1_macro = f1_score(y_test, preds, average='macro', zero_division=0)
        f1_weighted = f1_score(y_test, preds, average='weighted', zero_division=0)
        print(f"{name} -> Acc: {acc:.4f}, F1-Macro: {f1_macro:.4f}, F1-Weighted: {f1_weighted:.4f}, Top-{k}: {top_k_acc:.4f}")
        results.append({
            'Model': name,
            'Dataset': dataset_suffix,
            'Accuracy': acc,
            'F1-Macro': f1_macro,
            'F1-Weighted': f1_weighted,
            f'Top-{k} Acc': top_k_acc
        })
        trained_models[name] = model
        # Persist the fitted model.
        pkl_path = os.path.join(folder_path, f"Zimbabwe_{name}_{dataset_suffix}_Model.pkl")
        joblib.dump(model, pkl_path)
        # Visual diagnostics per model.
        plot_confusion_matrix_custom(y_test, preds, class_names, name, folder_path, dataset_suffix)
        plot_per_class_f1(y_test, preds, class_names, name, folder_path, dataset_suffix)
        plot_feature_importance_custom(model, selected_features, name, folder_path, dataset_suffix)
    # --- Voting Ensemble (soft vote over the already-defined estimators) ---
    print(f"\n--- Training Voting Ensemble (Soft Vote) - {dataset_suffix} ---")
    ensemble = VotingClassifier(
        estimators=list(trained_models.items()),
        voting='soft', n_jobs=-1
    )
    ensemble.fit(X_train, y_train)
    ens_probs = ensemble.predict_proba(X_test)
    ens_preds = np.argmax(ens_probs, axis=1)
    ens_acc = accuracy_score(y_test, ens_preds)
    ens_f1 = f1_score(y_test, ens_preds, average='macro', zero_division=0)
    ens_f1_w = f1_score(y_test, ens_preds, average='weighted', zero_division=0)
    ens_topk = top_k_accuracy_score(y_test, ens_probs, k=k)
    print(f"Ensemble -> Acc: {ens_acc:.4f}, F1-Macro: {ens_f1:.4f}, F1-Weighted: {ens_f1_w:.4f}, Top-{k}: {ens_topk:.4f}")
    joblib.dump(ensemble, os.path.join(folder_path, f"Zimbabwe_Ensemble_{dataset_suffix}_Model.pkl"))
    plot_confusion_matrix_custom(y_test, ens_preds, class_names, "Ensemble", folder_path, dataset_suffix)
    plot_per_class_f1(y_test, ens_preds, class_names, "Ensemble", folder_path, dataset_suffix)
    results.append({
        'Model': 'Ensemble',
        'Dataset': dataset_suffix,
        'Accuracy': ens_acc,
        'F1-Macro': ens_f1,
        'F1-Weighted': ens_f1_w,
        f'Top-{k} Acc': ens_topk
    })
    return results
# ==========================================
# MAIN EXECUTION
# ==========================================
if __name__ == "__main__":
    print("\n--- STARTING UPDATED CROP MAPPING SCRIPT ---")
    FOLDER_PATH = '/content/drive/MyDrive/GEE_Crop_Mapping_V2_Universal'
    # 1. Load the balanced raw dataset from Drive.
    df = load_balanced_data(FOLDER_PATH)
    if df is not None:
        # 2. Feature engineering: smooth -> phenology -> harmonics -> interactions.
        df_smooth, dates = apply_smoothing(df)
        df_full = add_interactions_and_windows(
            add_harmonics(extract_phenology(df_smooth, dates), dates),
            dates,
        )
        # Persist the fully engineered table (all columns, before split/selection).
        eng_path = os.path.join(FOLDER_PATH, "Zimbabwe_Crop_Engineered_Ready.csv")
        df_full.to_csv(eng_path, index=False)
        print(f"\nSaved Engineered Dataset (All Features): {eng_path}")
        # 3. Build the two variants: untouched features vs StandardScaled features.
        print("\nPreparing Datasets: Raw vs StandardScaled...")
        df_raw = df_full.copy()
        df_scaled = df_full.copy()
        feature_cols = df_scaled.columns.drop('label')
        df_scaled[feature_cols] = StandardScaler().fit_transform(df_scaled[feature_cols])
        # 4. Train & compare on both variants, collecting one result table.
        all_results = []
        for suffix, frame in (("Raw", df_raw), ("Scaled", df_scaled)):
            all_results.extend(train_and_compare(frame, FOLDER_PATH, dataset_suffix=suffix))
        # 5. Final combined comparison.
        final_df = pd.DataFrame(all_results)
        print("\nFinal Comparative Results (Raw vs Scaled):")
        print(final_df[['Model', 'Dataset', 'Accuracy', 'F1-Macro']])
        plt.figure(figsize=(14, 7))
        sns.barplot(data=final_df, x='Model', y='F1-Macro', hue='Dataset', palette='Paired')
        plt.title("Model Comparison: Raw vs StandardScaled Features (F1-Macro)", fontsize=16)
        plt.ylim(0, 1.05)
        plt.grid(axis='y', alpha=0.3)
        plt.legend(title='Dataset Version', loc='lower right')
        plt.tight_layout()
        plt.savefig(os.path.join(FOLDER_PATH, "Final_Comparison_Raw_vs_Scaled.png"))
        plt.show()