Update storage client with load_dataset and add comprehensive README

This commit is contained in:
fchinembiri 2026-04-23 22:29:19 +02:00
parent 182bf12e2d
commit bdc4d52f21
2 changed files with 136 additions and 49 deletions

68
README.md Normal file
View File

@ -0,0 +1,68 @@
# Sovereign MLOps Platform: GeoCrop LULC Portfolio
Welcome to the **Sovereign MLOps Platform**, a comprehensive self-hosted environment on K3s designed for end-to-end Land Use / Land Cover (LULC) crop-mapping in Zimbabwe.
This project showcases professional skills in **MLOps, Cloud-Native Architecture, Geospatial Analysis, and GitOps**.
## 🏗️ System Architecture
The platform is built on a robust, self-hosted Kubernetes (K3s) cluster with a focus on data sovereignty and scalability.
- **Source Control & CI/CD**: [Gitea](https://git.techarvest.co.zw) (Self-hosted GitHub alternative)
- **Infrastructure as Code**: Terraform (Managing K3s Namespaces & Quotas)
- **GitOps**: ArgoCD (Automated deployment from Git to Cluster)
- **Experiment Tracking**: [MLflow](https://ml.techarvest.co.zw) (Model versioning & metrics)
- **Interactive Workspace**: [JupyterLab](https://lab.techarvest.co.zw) (Data science & training)
- **Spatial Database**: Standalone PostgreSQL + PostGIS (Port 5433)
- **Object Storage**: MinIO (S3-compatible storage for datasets, baselines, and models)
- **Frontend**: React 19 + OpenLayers (Parallel loading of baselines and ML predictions)
- **Backend**: FastAPI + Redis Queue (Job orchestration)
- **Visualization**: TiTiler (Dynamic tile server for Cloud Optimized GeoTIFFs)
## 🗺️ UX Data Flow: Parallel Loading Strategy
To ensure a seamless user experience, the system implements a dual-loading strategy:
1. **Instant Context**: While waiting for ML inference, Dynamic World (DW) TIFF baselines (2015-2025) are immediately served from MinIO via TiTiler.
2. **Asynchronous Inference**: The ML worker processes heavy classification tasks in the background and overlays high-resolution predictions once complete.
## 🛠️ Training Workflow
Training is performed in **JupyterLab** using a custom `MinIOStorageClient` that bridges the gap between object storage and in-memory data processing.
### Using the MinIO Storage Client
```python
from training.storage_client import MinIOStorageClient
# Initialize client (uses environment variables automatically)
storage = MinIOStorageClient()
# List available training batches
batches = storage.list_files('geocrop-datasets')
# Load a batch directly into memory (no disk I/O); returns None on failure
df = storage.load_dataset('geocrop-datasets', 'batch_1.csv')
# Train your model and upload the artifact
# ... training code ...
storage.upload_file('model.pkl', 'geocrop-models', 'Zimbabwe_Ensemble_Model.pkl')
```
## 🚀 Deployment & GitOps
The platform follows a strict **GitOps** workflow:
1. All changes are committed to the `geocrop-platform` repository on Gitea.
2. Gitea Actions build and push containers to Docker Hub (`frankchine`).
3. ArgoCD monitors the `k8s/base` directory and automatically synchronizes the cluster state.
## 🖥️ Service Registry
- **Portfolio Frontend**: [portfolio.techarvest.co.zw](https://portfolio.techarvest.co.zw)
- **Source Control**: [git.techarvest.co.zw](https://git.techarvest.co.zw)
- **JupyterLab**: [lab.techarvest.co.zw](https://lab.techarvest.co.zw)
- **MLflow**: [ml.techarvest.co.zw](https://ml.techarvest.co.zw)
- **ArgoCD**: [cd.techarvest.co.zw](https://cd.techarvest.co.zw)
- **MinIO Console**: [console.minio.portfolio.techarvest.co.zw](https://console.minio.portfolio.techarvest.co.zw)
---
*Created and maintained by [fchinembiri](mailto:fchinembiri24@gmail.com).*

View File

@ -2,83 +2,102 @@ import boto3
import os import os
import logging import logging
import pandas as pd import pandas as pd
import io
from botocore.exceptions import ClientError from botocore.exceptions import ClientError
from io import BytesIO
# Configure logging for the worker/training scripts
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class MinIOStorageClient:
    """
    A reusable client for interacting with the local MinIO storage.

    Handles Datasets (CSVs), Baselines (DW TIFFs), and Model Artifacts.
    Credentials and the endpoint are read from environment variables so the
    same code runs unchanged in JupyterLab pods and the ML worker.
    """

    def __init__(self):
        # Initialize the S3 client from environment variables.
        # Defaults to the internal Kubernetes DNS for MinIO if not provided.
        self.endpoint_url = os.environ.get(
            'AWS_S3_ENDPOINT_URL',
            'http://minio.geocrop.svc.cluster.local:9000'
        )
        self.s3_client = boto3.client(
            's3',
            endpoint_url=self.endpoint_url,
            aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
            aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
            config=boto3.session.Config(signature_version='s3v4'),
            verify=False  # MinIO often uses self-signed certs internally
        )

    def list_files(self, bucket_name: str, prefix: str = "") -> list:
        """Lists files in a specific bucket, optionally filtered by a prefix folder.

        Returns:
            A list of ``{"key": str, "size_mb": float}`` dicts, or an empty
            list when the bucket/prefix holds no objects or the call fails.
        """
        try:
            logger.info(f"Scanning bucket '{bucket_name}' with prefix '{prefix}'...")
            files = []
            kwargs = {'Bucket': bucket_name, 'Prefix': prefix}
            # list_objects_v2 returns at most 1000 keys per call; follow the
            # continuation token so large buckets are listed completely.
            while True:
                response = self.s3_client.list_objects_v2(**kwargs)
                for obj in response.get('Contents', []):
                    size_mb = obj['Size'] / 1024 / 1024
                    files.append({"key": obj['Key'], "size_mb": size_mb})
                    logger.info(f" - Found: {obj['Key']} ({size_mb:.2f} MB)")
                if not response.get('IsTruncated'):
                    break
                kwargs['ContinuationToken'] = response['NextContinuationToken']
            if not files:
                logger.warning(f"No files found in bucket {bucket_name}.")
            return files
        except ClientError as e:
            logger.error(f"Failed to list files in {bucket_name}: {e}")
            return []

    def download_file(self, bucket_name: str, object_name: str, download_path: str) -> bool:
        """Downloads a file from MinIO to the local pod/container storage.

        Returns:
            True on success, False when the S3 call fails.
        """
        try:
            logger.info(f"Downloading {object_name} from {bucket_name} to {download_path}...")
            self.s3_client.download_file(bucket_name, object_name, download_path)
            logger.info("Download complete.")
            return True
        except ClientError as e:
            logger.error(f"Error downloading {object_name}: {e}")
            return False

    def upload_file(self, file_path: str, bucket_name: str, object_name: str) -> bool:
        """Uploads a local file (like a trained model or prediction COG) back to MinIO.

        Returns:
            True on success, False when the S3 call fails.
        """
        try:
            logger.info(f"Uploading {file_path} to {bucket_name}/{object_name}...")
            self.s3_client.upload_file(file_path, bucket_name, object_name)
            logger.info("Upload complete.")
            return True
        except ClientError as e:
            logger.error(f"Error uploading {file_path}: {e}")
            return False

    def load_dataset(self, bucket_name: str, object_name: str) -> "pd.DataFrame | None":
        """Loads a CSV dataset directly from MinIO into a Pandas DataFrame in memory.

        Streams the object body into ``pd.read_csv`` without touching disk.

        Returns:
            The parsed DataFrame, or None when the download or the CSV
            parsing fails — callers must check for None before use.
        """
        try:
            logger.info(f"Loading {object_name} from {bucket_name} into memory...")
            response = self.s3_client.get_object(Bucket=bucket_name, Key=object_name)
            df = pd.read_csv(io.BytesIO(response['Body'].read()))
            logger.info(f"Successfully loaded dataset with shape: {df.shape}")
            return df
        except ClientError as e:
            logger.error(f"Error loading {object_name}: {e}")
            return None
        except Exception as e:
            logger.error(f"Error parsing {object_name} into DataFrame: {e}")
            return None
# ==========================================
# Example Usage (For your Jupyter Notebooks)
# ==========================================
if __name__ == "__main__":
    storage = MinIOStorageClient()

    # 1. List the Zimbabwe Augmented CSV batches
    datasets_bucket = 'geocrop-datasets'
    csv_files = storage.list_files(datasets_bucket)

    # 2. Load a batch directly into memory for training
    if csv_files:
        first_batch_key = csv_files[0]['key']
        df = storage.load_dataset(datasets_bucket, first_batch_key)
        # load_dataset returns None on failure; guard before inspecting.
        if df is not None:
            # DataFrame.info() prints its summary itself and returns None,
            # so it must not be wrapped in print() (that emits a stray "None").
            df.info()