import io
import logging
import os
from typing import Optional

import boto3
import pandas as pd
from botocore.config import Config
from botocore.exceptions import ClientError

# Configure logging for the worker/training scripts
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class MinIOStorageClient:
    """
    A reusable client for interacting with the local MinIO storage.
    Handles Datasets (CSVs), Baselines (DW TIFFs), and Model Artifacts.
    """

    def __init__(self):
        # Initialize the S3 client from environment variables.
        # Defaults to the internal Kubernetes DNS name for MinIO if not provided.
        self.endpoint_url = os.environ.get(
            'AWS_S3_ENDPOINT_URL',
            'http://minio.geocrop.svc.cluster.local:9000'
        )
        self.s3_client = boto3.client(
            's3',
            endpoint_url=self.endpoint_url,
            aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
            aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
            config=Config(signature_version='s3v4'),
            verify=False  # MinIO often uses self-signed certs internally
        )

    def list_files(self, bucket_name: str, prefix: str = "") -> list:
        """Lists files in a bucket, optionally filtered by a prefix (folder)."""
        try:
            files = []
            logger.info(f"Scanning bucket '{bucket_name}' with prefix '{prefix}'...")
            # Use a paginator so buckets with more than 1,000 objects
            # (the list_objects_v2 page limit) are fully listed.
            paginator = self.s3_client.get_paginator('list_objects_v2')
            for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
                for obj in page.get('Contents', []):
                    size_mb = obj['Size'] / 1024 / 1024
                    files.append({"key": obj['Key'], "size_mb": size_mb})
                    logger.info(f"  - Found: {obj['Key']} ({size_mb:.2f} MB)")
            if not files:
                logger.warning(f"No files found in bucket {bucket_name}.")
            return files
        except ClientError as e:
            logger.error(f"Failed to list files in {bucket_name}: {e}")
            return []

    def download_file(self, bucket_name: str, object_name: str, download_path: str) -> bool:
        """Downloads a file from MinIO to local pod/container storage."""
        try:
            logger.info(f"Downloading {object_name} from {bucket_name} to {download_path}...")
            self.s3_client.download_file(bucket_name, object_name, download_path)
            logger.info("Download complete.")
            return True
        except ClientError as e:
            logger.error(f"Error downloading {object_name}: {e}")
            return False

    def upload_file(self, file_path: str, bucket_name: str, object_name: str) -> bool:
        """Uploads a local file (e.g. a trained model or prediction COG) back to MinIO."""
        try:
            logger.info(f"Uploading {file_path} to {bucket_name}/{object_name}...")
            self.s3_client.upload_file(file_path, bucket_name, object_name)
            logger.info("Upload complete.")
            return True
        except ClientError as e:
            logger.error(f"Error uploading {file_path}: {e}")
            return False

    def load_dataset(self, bucket_name: str, object_name: str) -> Optional[pd.DataFrame]:
        """Loads a CSV dataset directly from MinIO into a pandas DataFrame in memory."""
        try:
            logger.info(f"Loading {object_name} from {bucket_name} into memory...")
            response = self.s3_client.get_object(Bucket=bucket_name, Key=object_name)
            df = pd.read_csv(io.BytesIO(response['Body'].read()))
            logger.info(f"Successfully loaded dataset with shape: {df.shape}")
            return df
        except ClientError as e:
            logger.error(f"Error loading {object_name}: {e}")
            return None
        except Exception as e:
            logger.error(f"Error parsing {object_name} into DataFrame: {e}")
            return None


# ==========================================
# Example Usage (For your Jupyter Notebooks)
# ==========================================
if __name__ == "__main__":
    storage = MinIOStorageClient()

    # 1. List the Zimbabwe Augmented CSV batches
    datasets_bucket = 'geocrop-datasets'
    csv_files = storage.list_files(datasets_bucket)
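    # (Sketch) Pull a Dynamic World baseline TIFF to local disk before training.
    # 'geocrop-baselines' and the object key below are assumed names for
    # illustration only -- uncomment and point them at your actual baseline bucket.
    # storage.download_file('geocrop-baselines', 'zimbabwe/dw_baseline.tif',
    #                       '/tmp/dw_baseline.tif')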
    # 2. Load a batch directly into memory for training
    if csv_files:
        first_batch_key = csv_files[0]['key']
        df = storage.load_dataset(datasets_bucket, first_batch_key)
        if df is not None:
            df.info()  # prints the summary itself; wrapping in print() would also emit 'None'
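    # 3. (Sketch) Round-trip check: upload an artifact back to MinIO.
    # 'geocrop-models' and the placeholder file are assumed names for
    # illustration; in practice this would be a trained model or prediction COG.
    artifact_path = '/tmp/upload_smoke_test.txt'
    with open(artifact_path, 'w') as f:
        f.write('placeholder artifact for the upload example')
    storage.upload_file(artifact_path, 'geocrop-models', 'examples/upload_smoke_test.txt')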