Update storage client with load_dataset and add comprehensive README
This commit is contained in:
parent
182bf12e2d
commit
bdc4d52f21
|
|
@ -0,0 +1,68 @@
|
||||||
|
# Sovereign MLOps Platform: GeoCrop LULC Portfolio

Welcome to the **Sovereign MLOps Platform**, a comprehensive self-hosted environment on K3s designed for end-to-end Land Use / Land Cover (LULC) crop-mapping in Zimbabwe.
This project showcases professional skills in **MLOps, Cloud-Native Architecture, Geospatial Analysis, and GitOps**.

## 🏗️ System Architecture

The platform is built on a robust, self-hosted Kubernetes (K3s) cluster with a focus on data sovereignty and scalability.

- **Source Control & CI/CD**: [Gitea](https://git.techarvest.co.zw) (Self-hosted GitHub alternative)
- **Infrastructure as Code**: Terraform (Managing K3s Namespaces & Quotas)
- **GitOps**: ArgoCD (Automated deployment from Git to Cluster)
- **Experiment Tracking**: [MLflow](https://ml.techarvest.co.zw) (Model versioning & metrics)
- **Interactive Workspace**: [JupyterLab](https://lab.techarvest.co.zw) (Data science & training)
- **Spatial Database**: Standalone PostgreSQL + PostGIS (Port 5433)
- **Object Storage**: MinIO (S3-compatible storage for datasets, baselines, and models)
- **Frontend**: React 19 + OpenLayers (Parallel loading of baselines and ML predictions)
- **Backend**: FastAPI + Redis Queue (Job orchestration)
- **Visualization**: TiTiler (Dynamic tile server for Cloud Optimized GeoTIFFs)

## 🗺️ UX Data Flow: Parallel Loading Strategy

To ensure a seamless user experience, the system implements a dual-loading strategy:

1. **Instant Context**: While waiting for ML inference, Dynamic World (DW) TIFF baselines (2015-2025) are immediately served from MinIO via TiTiler.
2. **Asynchronous Inference**: The ML worker processes heavy classification tasks in the background and overlays high-resolution predictions once complete.

## 🛠️ Training Workflow

Training is performed in **JupyterLab** using a custom `MinIOStorageClient` that bridges the gap between object storage and in-memory data processing.

### Using the MinIO Storage Client

```python
from training.storage_client import MinIOStorageClient

# Initialize client (uses environment variables automatically)
storage = MinIOStorageClient()

# List available training batches
batches = storage.list_files('geocrop-datasets')

# Load a batch directly into memory (No disk I/O)
df = storage.load_dataset('geocrop-datasets', 'batch_1.csv')

# Train your model and upload the artifact
# ... training code ...
storage.upload_file('model.pkl', 'geocrop-models', 'Zimbabwe_Ensemble_Model.pkl')
```

## 🚀 Deployment & GitOps

The platform follows a strict **GitOps** workflow:

1. All changes are committed to the `geocrop-platform` repository on Gitea.
2. Gitea Actions build and push containers to Docker Hub (`frankchine`).
3. ArgoCD monitors the `k8s/base` directory and automatically synchronizes the cluster state.

## 🖥️ Service Registry

- **Portfolio Frontend**: [portfolio.techarvest.co.zw](https://portfolio.techarvest.co.zw)
- **Source Control**: [git.techarvest.co.zw](https://git.techarvest.co.zw)
- **JupyterLab**: [lab.techarvest.co.zw](https://lab.techarvest.co.zw)
- **MLflow**: [ml.techarvest.co.zw](https://ml.techarvest.co.zw)
- **ArgoCD**: [cd.techarvest.co.zw](https://cd.techarvest.co.zw)
- **MinIO Console**: [console.minio.portfolio.techarvest.co.zw](https://console.minio.portfolio.techarvest.co.zw)

---

*Created and maintained by [fchinembiri](mailto:fchinembiri24@gmail.com).*
|
@ -2,83 +2,102 @@ import boto3
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import io
|
||||||
from botocore.exceptions import ClientError
|
from botocore.exceptions import ClientError
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging for the worker/training scripts
|
||||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class MinIOStorageClient:
|
class MinIOStorageClient:
|
||||||
"""
|
"""
|
||||||
A professional client for interacting with GeoCrop MinIO storage.
|
A reusable client for interacting with the local MinIO storage.
|
||||||
Optimized for JupyterLab training workflows.
|
Handles Datasets (CSVs), Baselines (DW TIFFs), and Model Artifacts.
|
||||||
"""
|
"""
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
# Configuration from Kubernetes Environment Variables
|
# Initialize S3 client using environment variables
|
||||||
|
# Defaults to the internal Kubernetes DNS for MinIO if not provided
|
||||||
self.endpoint_url = os.environ.get('AWS_S3_ENDPOINT_URL', 'http://minio.geocrop.svc.cluster.local:9000')
|
self.endpoint_url = os.environ.get('AWS_S3_ENDPOINT_URL', 'http://minio.geocrop.svc.cluster.local:9000')
|
||||||
self.access_key = os.environ.get('AWS_ACCESS_KEY_ID')
|
|
||||||
self.secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
|
|
||||||
|
|
||||||
self.s3_client = boto3.client(
|
self.s3_client = boto3.client(
|
||||||
's3',
|
's3',
|
||||||
endpoint_url=self.endpoint_url,
|
endpoint_url=self.endpoint_url,
|
||||||
aws_access_key_id=self.access_key,
|
aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
|
||||||
aws_secret_access_key=self.secret_key,
|
aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
|
||||||
config=boto3.session.Config(signature_version='s3v4'),
|
config=boto3.session.Config(signature_version='s3v4'),
|
||||||
verify=False # Required for internal self-signed K3s certs
|
verify=False # MinIO often uses self-signed certs internally
|
||||||
)
|
)
|
||||||
|
|
||||||
def list_files(self, bucket_name: str, prefix: str = "") -> list:
|
def list_files(self, bucket_name: str, prefix: str = "") -> list:
|
||||||
"""Lists metadata for all files in a bucket/folder."""
|
"""Lists files in a specific bucket, optionally filtered by a prefix folder."""
|
||||||
try:
|
try:
|
||||||
response = self.s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
|
response = self.s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
|
||||||
files = []
|
files = []
|
||||||
|
|
||||||
|
logger.info(f"Scanning bucket '{bucket_name}' with prefix '{prefix}'...")
|
||||||
if 'Contents' in response:
|
if 'Contents' in response:
|
||||||
for obj in response['Contents']:
|
for obj in response['Contents']:
|
||||||
files.append({
|
size_mb = obj['Size'] / 1024 / 1024
|
||||||
"key": obj['Key'],
|
files.append({"key": obj['Key'], "size_mb": size_mb})
|
||||||
"size_mb": round(obj['Size'] / (1024 * 1024), 2),
|
logger.info(f" - Found: {obj['Key']} ({size_mb:.2f} MB)")
|
||||||
"last_modified": obj['LastModified']
|
else:
|
||||||
})
|
logger.warning(f"No files found in bucket {bucket_name}.")
|
||||||
|
|
||||||
return files
|
return files
|
||||||
except ClientError as e:
|
except ClientError as e:
|
||||||
logger.error(f"Failed to list {bucket_name}: {e}")
|
logger.error(f"Failed to list files in {bucket_name}: {e}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
def download_file(self, bucket_name: str, object_name: str, download_path: str) -> bool:
|
||||||
|
"""Downloads a file from MinIO to the local pod/container storage."""
|
||||||
|
try:
|
||||||
|
logger.info(f"Downloading {object_name} from {bucket_name} to {download_path}...")
|
||||||
|
self.s3_client.download_file(bucket_name, object_name, download_path)
|
||||||
|
logger.info("Download complete.")
|
||||||
|
return True
|
||||||
|
except ClientError as e:
|
||||||
|
logger.error(f"Error downloading {object_name}: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def upload_file(self, file_path: str, bucket_name: str, object_name: str) -> bool:
|
||||||
|
"""Uploads a local file (like a trained model or prediction COG) back to MinIO."""
|
||||||
|
try:
|
||||||
|
logger.info(f"Uploading {file_path} to {bucket_name}/{object_name}...")
|
||||||
|
self.s3_client.upload_file(file_path, bucket_name, object_name)
|
||||||
|
logger.info("Upload complete.")
|
||||||
|
return True
|
||||||
|
except ClientError as e:
|
||||||
|
logger.error(f"Error uploading {file_path}: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
def load_dataset(self, bucket_name: str, object_name: str) -> pd.DataFrame:
|
def load_dataset(self, bucket_name: str, object_name: str) -> pd.DataFrame:
|
||||||
"""
|
"""Loads a CSV dataset directly from MinIO into a Pandas DataFrame in memory."""
|
||||||
Reads a CSV directly from MinIO into a Pandas DataFrame without saving to disk.
|
|
||||||
Best for training on large batches.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
logger.info(f"Streaming s3://{bucket_name}/{object_name} into memory...")
|
logger.info(f"Loading {object_name} from {bucket_name} into memory...")
|
||||||
obj = self.s3_client.get_object(Bucket=bucket_name, Key=object_name)
|
response = self.s3_client.get_object(Bucket=bucket_name, Key=object_name)
|
||||||
return pd.read_csv(BytesIO(obj['Body'].read()))
|
df = pd.read_csv(io.BytesIO(response['Body'].read()))
|
||||||
|
logger.info(f"Successfully loaded dataset with shape: {df.shape}")
|
||||||
|
return df
|
||||||
|
except ClientError as e:
|
||||||
|
logger.error(f"Error loading {object_name}: {e}")
|
||||||
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to load dataset {object_name}: {e}")
|
logger.error(f"Error parsing {object_name} into DataFrame: {e}")
|
||||||
return pd.DataFrame()
|
return None
|
||||||
|
|
||||||
def download_file(self, bucket_name: str, object_name: str, local_path: str):
|
# ==========================================
|
||||||
"""Downloads a binary file (e.g., a TIFF baseline) to local storage."""
|
# Example Usage (For your Jupyter Notebooks)
|
||||||
try:
|
# ==========================================
|
||||||
self.s3_client.download_file(bucket_name, object_name, local_path)
|
if __name__ == "__main__":
|
||||||
logger.info(f"Successfully downloaded to {local_path}")
|
storage = MinIOStorageClient()
|
||||||
except ClientError as e:
|
|
||||||
logger.error(f"Download failed: {e}")
|
# 1. List the Zimbabwe Augmented CSV batches
|
||||||
|
datasets_bucket = 'geocrop-datasets'
|
||||||
def upload_artifact(self, local_path: str, bucket_name: str, object_name: str):
|
csv_files = storage.list_files(datasets_bucket)
|
||||||
"""Uploads a local file (e.g., a trained model .pkl) back to MinIO."""
|
|
||||||
try:
|
# 2. Load a batch directly into memory for training
|
||||||
self.s3_client.upload_file(local_path, bucket_name, object_name)
|
if csv_files:
|
||||||
logger.info(f"Artifact uploaded to s3://{bucket_name}/{object_name}")
|
first_batch_key = csv_files[0]['key']
|
||||||
except ClientError as e:
|
df = storage.load_dataset(datasets_bucket, first_batch_key)
|
||||||
logger.error(f"Upload failed: {e}")
|
if df is not None:
|
||||||
|
print(df.info())
|
||||||
def get_pandas_options(self):
|
|
||||||
"""Returns a dict to use with pd.read_csv('s3://...') if preferred."""
|
|
||||||
return {
|
|
||||||
"key": self.access_key,
|
|
||||||
"secret": self.secret_key,
|
|
||||||
"client_kwargs": {"endpoint_url": self.endpoint_url}
|
|
||||||
}
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue