geocrop-platform./training/train_v2.py

import sys
import os
import logging
from training.storage_client import MinIOStorageClient
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import mlflow
import mlflow.sklearn
import joblib

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def train():
    # 1. Setup Environment
    # MLflow server URL (Internal K8s DNS)
    mlflow.set_tracking_uri("http://mlflow.geocrop.svc.cluster.local:5000")
    mlflow.set_experiment("GeoCrop_Retraining")

    storage = MinIOStorageClient()

    # 2. Fetch Data
    logger.info("Fetching latest training data...")
    # For now, let's load batch_1.csv as an example
    df = storage.load_dataset('geocrop-datasets', 'batch_1.csv')

    if df is None or df.empty:
        logger.error("No data found!")
        return

    # 3. Simple Feature Engineering (Placeholder)
    # Filter columns to only include features and target
    # FEATURE_ORDER_V1 (subset for example)
    features = ['ndvi_peak', 'evi_peak', 'savi_peak']
    target = 'class' # Update with actual column name

    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 4. Training with MLflow Tracking
    with mlflow.start_run():
        logger.info("Training model...")
        model = XGBClassifier(n_estimators=100, max_depth=5)
        model.fit(X_train, y_train)

        # Log Metrics
        score = model.score(X_test, y_test)
        mlflow.log_metric("accuracy", score)
        logger.info(f"Accuracy: {score}")

        # Log Model to MLflow
        mlflow.sklearn.log_model(model, "model")

        # 5. Save Artifact and Upload to MinIO
        model_filename = "Zimbabwe_Ensemble_Model_latest.pkl"
        joblib.dump(model, model_filename)

        storage.upload_file(model_filename, 'geocrop-models', model_filename)
        logger.info(f"Uploaded model to MinIO: {model_filename}")

        # 6. Generate Inference Script (Dynamic)
        generate_inference_script(features)

def generate_inference_script(features):
    """
    Creates a new inference script tailored to the trained model's features.
    """
    script_content = f"""
import os
from training.storage_client import MinIOStorageClient
import joblib
import pandas as pd

def run_inference(aoi_data):
    storage = MinIOStorageClient()
    # Download the latest model
    storage.download_file('geocrop-models', 'Zimbabwe_Ensemble_Model_latest.pkl', 'model.pkl')
    model = joblib.load('model.pkl')

    # Preprocess aoi_data (should pull from DEA STAC here)
    # features = {features}
    # ... logic to fetch from DEA STAC and compute these features ...

    # result = model.predict(preprocessed_data)
    # return result
"""
    with open("training/generated_inference.py", "w") as f:
        f.write(script_content)
    logger.info("Generated new inference script: training/generated_inference.py")

if __name__ == "__main__":
    train()