import sys
import os
import logging

import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from training.storage_client import MinIOStorageClient

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def train():
    """Run one end-to-end retraining cycle for the GeoCrop model.

    Steps:
      1. Point MLflow at the in-cluster tracking server and experiment.
      2. Pull the latest training batch from MinIO.
      3. Select feature/target columns and split train/test.
      4. Fit an XGBoost classifier, logging accuracy + model to MLflow.
      5. Persist the model with joblib and upload it to MinIO.
      6. Regenerate the inference script for the trained feature set.

    Returns early (logging an error) if no training data is available.
    """
    # 1. Setup Environment
    # MLflow server URL (Internal K8s DNS)
    mlflow.set_tracking_uri("http://mlflow.geocrop.svc.cluster.local:5000")
    mlflow.set_experiment("GeoCrop_Retraining")
    storage = MinIOStorageClient()

    # 2. Fetch Data
    logger.info("Fetching latest training data...")
    # For now, let's load batch_1.csv as an example
    df = storage.load_dataset('geocrop-datasets', 'batch_1.csv')
    if df is None or df.empty:
        logger.error("No data found!")
        return

    # 3. Simple Feature Engineering (Placeholder)
    # Filter columns to only include features and target
    # FEATURE_ORDER_V1 (subset for example)
    features = ['ndvi_peak', 'evi_peak', 'savi_peak']
    target = 'class'  # Update with actual column name

    X = df[features]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # 4. Training with MLflow Tracking
    with mlflow.start_run():
        logger.info("Training model...")
        model = XGBClassifier(n_estimators=100, max_depth=5)
        model.fit(X_train, y_train)

        # Log Metrics (mean accuracy on the held-out split)
        score = model.score(X_test, y_test)
        mlflow.log_metric("accuracy", score)
        logger.info("Accuracy: %s", score)

        # Log Model to MLflow
        mlflow.sklearn.log_model(model, "model")

        # 5. Save Artifact and Upload to MinIO
        model_filename = "Zimbabwe_Ensemble_Model_latest.pkl"
        joblib.dump(model, model_filename)
        storage.upload_file(model_filename, 'geocrop-models', model_filename)
        logger.info("Uploaded model to MinIO: %s", model_filename)
        # Fix: remove the local copy once uploaded so repeated runs don't
        # leave stale artifacts on the pod filesystem (`os` was imported
        # but previously unused).
        os.remove(model_filename)

    # 6. Generate Inference Script (Dynamic)
    generate_inference_script(features)


def generate_inference_script(features):
    """Create a new inference script tailored to the trained model's features.

    Parameters
    ----------
    features : list[str]
        Feature column names the model was trained on. Embedded (via repr)
        into the generated script as a reminder of the expected inputs.

    Side effects:
        Writes ``training/generated_inference.py``, overwriting any
        previous version.
    """
    script_content = f"""
import os
from training.storage_client import MinIOStorageClient
import joblib
import pandas as pd

def run_inference(aoi_data):
    storage = MinIOStorageClient()
    # Download the latest model
    storage.download_file('geocrop-models', 'Zimbabwe_Ensemble_Model_latest.pkl', 'model.pkl')
    model = joblib.load('model.pkl')

    # Preprocess aoi_data (should pull from DEA STAC here)
    # features = {features}
    # ... logic to fetch from DEA STAC and compute these features ...

    # result = model.predict(preprocessed_data)
    # return result
"""
    # Fix: ensure the target directory exists so the write does not fail
    # when the job runs from a working directory without `training/`.
    os.makedirs("training", exist_ok=True)
    with open("training/generated_inference.py", "w") as f:
        f.write(script_content)
    logger.info("Generated new inference script: training/generated_inference.py")


if __name__ == "__main__":
    train()