Initial commit: File Transformer S3 project with React dashboard and Knative functions

This commit is contained in:
greg
2025-07-04 08:01:46 -07:00
commit fd9abd0210
54 changed files with 5584 additions and 0 deletions

29
functions/Makefile Normal file
View File

@@ -0,0 +1,29 @@
# Makefile for the Knative functions: builds/removes the four docker images.
# Usage: `make build` (all), `make build-<name>` (one), `make clean`.
.PHONY: build build-upload build-transform build-download build-metadata clean

# Build all functions
build: build-upload build-transform build-download build-metadata

# Build upload function
build-upload:
	@echo "Building upload function..."
	@cd upload && docker build -t function-upload:latest .

# Build transform function
build-transform:
	@echo "Building transform function..."
	@cd transform && docker build -t function-transform:latest .

# Build download function
build-download:
	@echo "Building download function..."
	@cd download && docker build -t function-download:latest .

# Build metadata function
build-metadata:
	@echo "Building metadata function..."
	@cd metadata && docker build -t function-metadata:latest .

# Clean all function images (ignore errors for images that do not exist)
clean:
	@echo "Cleaning function images..."
	@docker rmi function-upload:latest function-transform:latest function-download:latest function-metadata:latest 2>/dev/null || true

View File

@@ -0,0 +1,30 @@
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies.
# BUG FIX: curl was missing — python:3.11-slim does not ship curl, so the
# HEALTHCHECK below always failed and the container was marked unhealthy.
RUN apt-get update && apt-get install -y \
    gcc \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app.py .

# Create non-root user so the service does not run as root
RUN useradd --create-home --shell /bin/bash app && chown -R app:app /app
USER app

# Expose port
EXPOSE 5000

# Health check against the Flask /health endpoint
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:5000/health || exit 1

# Run the application
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "4", "--timeout", "300", "app:app"]

201
functions/download/app.py Normal file
View File

@@ -0,0 +1,201 @@
import os
import logging  # NOTE(review): appears unused in this module — confirm before removing
from datetime import datetime
from typing import Optional
from flask import Flask, request, jsonify, send_file
import psycopg2
from psycopg2.extras import RealDictCursor
from minio import Minio
from minio.error import S3Error
import structlog
import io

# Configure structured logging: each record is emitted as a single JSON
# object with an ISO-8601 timestamp, logger name, level, and rendered
# exception/stack info.
structlog.configure(
    processors=[
        structlog.stdlib.filter_by_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.UnicodeDecoder(),
        structlog.processors.JSONRenderer()
    ],
    context_class=dict,
    logger_factory=structlog.stdlib.LoggerFactory(),
    wrapper_class=structlog.stdlib.BoundLogger,
    cache_logger_on_first_use=True,  # build the wrapped logger once and reuse it
)
logger = structlog.get_logger()

app = Flask(__name__)
# Configuration — read from the environment, with local-development defaults.
# NOTE(review): the default MinIO and Postgres credentials below are dev-only
# placeholders; production deployments must override them via env vars.
MINIO_ENDPOINT = os.getenv('MINIO_ENDPOINT', 'localhost:9000')
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY', 'minioadmin')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY', 'minioadmin123')
MINIO_BUCKET_NAME = os.getenv('MINIO_BUCKET_NAME', 'file-transformer-bucket')
MINIO_USE_SSL = os.getenv('MINIO_USE_SSL', 'false').lower() == 'true'
POSTGRES_URL = os.getenv('POSTGRES_URL', 'postgresql://file_user:secure_password_123@localhost:5432/file_transformer')

# Initialize MinIO client (module-level, shared by all request handlers)
minio_client = Minio(
    MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    secure=MINIO_USE_SSL
)
def get_db_connection():
    """Open and return a fresh psycopg2 connection to the metadata database."""
    connection = psycopg2.connect(POSTGRES_URL)
    return connection
def get_file_info(file_id: str) -> Optional[dict]:
    """Look up a non-deleted file row by id.

    Returns the row as a plain dict, or None when the file is unknown,
    deleted, or the query fails (failures are logged, not raised).
    """
    query = """
            SELECT id, filename, original_filename, file_size, file_type,
                   mime_type, bucket_name, object_key, status
            FROM files
            WHERE id = %s AND status != 'deleted'
        """
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(query, (file_id,))
            row = cur.fetchone()
    except Exception as e:
        logger.error("Failed to get file info", error=str(e))
        return None
    finally:
        conn.close()
    return dict(row) if row else None
def log_file_access(file_id: str, action: str, ip_address: str, user_agent: Optional[str]):
    """Record an audit-trail row for an access to *file_id* (best effort).

    Failures are logged and swallowed so auditing never breaks the request.
    """
    conn = get_db_connection()
    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO file_access_logs (file_id, action, ip_address, user_agent)
                VALUES (%s, %s, %s, %s)
                """,
                (file_id, action, ip_address, user_agent),
            )
        conn.commit()
    except Exception as e:
        logger.error("Failed to log file access", error=str(e))
        conn.rollback()
    finally:
        conn.close()
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: always reports healthy with the service name."""
    payload = {'status': 'healthy', 'service': 'file-download'}
    return jsonify(payload)
@app.route('/download/<file_id>', methods=['GET'])
def download_file(file_id: str):
    """Download file by ID.

    Buffers the whole object from MinIO into memory, writes an audit row,
    and serves the bytes as an attachment. Returns 404 when the database
    row or the stored object is missing, 500 on unexpected errors.
    """
    try:
        # Get file information
        file_info = get_file_info(file_id)
        if not file_info:
            return jsonify({'error': 'File not found'}), 404
        # NOTE(review): get_file_info() already filters out status='deleted'
        # rows, so this branch looks unreachable — kept as defense in depth.
        if file_info['status'] == 'deleted':
            return jsonify({'error': 'File has been deleted'}), 404
        # Get file from MinIO (entire object is read into memory)
        try:
            response = minio_client.get_object(
                file_info['bucket_name'],
                file_info['object_key']
            )
            file_data = response.read()
            response.close()
            response.release_conn()  # return the pooled HTTP connection
        except S3Error as e:
            logger.error("Failed to get file from MinIO", error=str(e))
            return jsonify({'error': 'File not found in storage'}), 404
        # Log access (best effort — failures are swallowed inside the helper)
        log_file_access(file_id, 'download', request.remote_addr, request.headers.get('User-Agent'))
        # Create file-like object for Flask to serve
        file_stream = io.BytesIO(file_data)
        file_stream.seek(0)
        logger.info("File download completed",
                    file_id=file_id,
                    filename=file_info['filename'],
                    size=len(file_data))
        return send_file(
            file_stream,
            mimetype=file_info['mime_type'],
            as_attachment=True,
            download_name=file_info['original_filename']
        )
    except Exception as e:
        logger.error("Download error", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
@app.route('/files/<file_id>/info', methods=['GET'])
def get_file_info_endpoint(file_id: str):
    """Return the file's database record as JSON without downloading it."""
    try:
        info = get_file_info(file_id)
        if info is None:
            return jsonify({'error': 'File not found'}), 404
        # Audit the metadata view (best effort).
        log_file_access(file_id, 'view', request.remote_addr, request.headers.get('User-Agent'))
        return jsonify(info), 200
    except Exception as e:
        logger.error("Error fetching file info", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
@app.route('/files/<file_id>/url', methods=['GET'])
def get_download_url(file_id: str):
    """Return a short-lived (1 hour) presigned MinIO download URL.

    Responds 404 when the file is unknown or deleted, 500 when the URL
    cannot be generated.
    """
    from datetime import timedelta  # local import: module only imports datetime

    try:
        file_info = get_file_info(file_id)
        if not file_info:
            return jsonify({'error': 'File not found'}), 404
        if file_info['status'] == 'deleted':
            return jsonify({'error': 'File has been deleted'}), 404
        expires_seconds = 3600  # 1 hour
        # Generate presigned URL.
        # BUG FIX: minio-py 7.x requires `expires` to be a datetime.timedelta;
        # the old bare int (3600) raised ValueError before any URL was built.
        try:
            url = minio_client.presigned_get_object(
                file_info['bucket_name'],
                file_info['object_key'],
                expires=timedelta(seconds=expires_seconds)
            )
        except S3Error as e:
            logger.error("Failed to generate presigned URL", error=str(e))
            return jsonify({'error': 'Failed to generate download URL'}), 500
        # Audit the URL generation (best effort).
        log_file_access(file_id, 'url_generated', request.remote_addr, request.headers.get('User-Agent'))
        return jsonify({
            'file_id': file_id,
            'filename': file_info['original_filename'],
            'download_url': url,
            'expires_in': expires_seconds
        }), 200
    except Exception as e:
        logger.error("Error generating download URL", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
if __name__ == '__main__':
    # Local/dev entry point only; in containers the app is served by gunicorn.
    app.run(host='0.0.0.0', port=5000, debug=False)

View File

@@ -0,0 +1,24 @@
# Core dependencies
flask==2.3.3
gunicorn==21.2.0
python-dotenv==1.0.0
# Database
psycopg2-binary==2.9.7
sqlalchemy==2.0.21
# MinIO/S3
minio==7.1.17
boto3==1.28.44
# HTTP requests
requests==2.31.0
# JSON and data handling
pydantic==2.1.1
# Logging
structlog==23.1.0
# Utilities
python-dateutil==2.8.2

View File

@@ -0,0 +1,30 @@
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies.
# BUG FIX: curl was missing — python:3.11-slim does not ship curl, so the
# HEALTHCHECK below always failed and the container was marked unhealthy.
RUN apt-get update && apt-get install -y \
    gcc \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app.py .

# Create non-root user so the service does not run as root
RUN useradd --create-home --shell /bin/bash app && chown -R app:app /app
USER app

# Expose port
EXPOSE 5000

# Health check against the Flask /health endpoint
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:5000/health || exit 1

# Run the application
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "4", "--timeout", "300", "app:app"]

307
functions/metadata/app.py Normal file
View File

@@ -0,0 +1,307 @@
import os
import json
import logging  # NOTE(review): appears unused in this module — confirm before removing
from datetime import datetime
from typing import Optional, Dict, Any
from flask import Flask, request, jsonify
import psycopg2
from psycopg2.extras import RealDictCursor
from minio import Minio
from minio.error import S3Error
import structlog

# Configure structured logging: each record is emitted as a single JSON
# object with an ISO-8601 timestamp, logger name, level, and rendered
# exception/stack info.
structlog.configure(
    processors=[
        structlog.stdlib.filter_by_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.UnicodeDecoder(),
        structlog.processors.JSONRenderer()
    ],
    context_class=dict,
    logger_factory=structlog.stdlib.LoggerFactory(),
    wrapper_class=structlog.stdlib.BoundLogger,
    cache_logger_on_first_use=True,  # build the wrapped logger once and reuse it
)
logger = structlog.get_logger()

app = Flask(__name__)
# Configuration — read from the environment, with local-development defaults.
# NOTE(review): the default MinIO and Postgres credentials below are dev-only
# placeholders; production deployments must override them via env vars.
MINIO_ENDPOINT = os.getenv('MINIO_ENDPOINT', 'localhost:9000')
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY', 'minioadmin')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY', 'minioadmin123')
MINIO_BUCKET_NAME = os.getenv('MINIO_BUCKET_NAME', 'file-transformer-bucket')
MINIO_USE_SSL = os.getenv('MINIO_USE_SSL', 'false').lower() == 'true'
POSTGRES_URL = os.getenv('POSTGRES_URL', 'postgresql://file_user:secure_password_123@localhost:5432/file_transformer')

# Initialize MinIO client (module-level, shared by all request handlers)
minio_client = Minio(
    MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    secure=MINIO_USE_SSL
)
def get_db_connection():
    """Open and return a fresh psycopg2 connection to the metadata database."""
    connection = psycopg2.connect(POSTGRES_URL)
    return connection
def get_file_metadata(file_id: str) -> Optional[Dict[str, Any]]:
    """Fetch the full files row for *file_id*.

    Returns the row as a dict, or None when the file is unknown or the
    query fails (failures are logged, not raised).
    """
    query = """
            SELECT id, filename, original_filename, file_size, file_type,
                   mime_type, bucket_name, object_key, checksum, status,
                   transformation_type, transformation_config, metadata,
                   created_at, updated_at, processed_at
            FROM files
            WHERE id = %s
        """
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(query, (file_id,))
            row = cur.fetchone()
        return dict(row) if row else None
    except Exception as e:
        logger.error("Failed to get file metadata", error=str(e))
        return None
    finally:
        conn.close()
def get_file_transformations(file_id: str) -> list:
    """List every transformation row for a file, newest first ([] on error)."""
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(
                """
                SELECT id, transformation_type, input_path, output_path,
                       status, config, result, error_message,
                       started_at, completed_at, created_at
                FROM transformations
                WHERE file_id = %s
                ORDER BY created_at DESC
                """,
                (file_id,),
            )
            rows = cur.fetchall()
        return [dict(row) for row in rows]
    except Exception as e:
        logger.error("Failed to get file transformations", error=str(e))
        return []
    finally:
        conn.close()
def get_file_access_logs(file_id: str, limit: int = 50) -> list:
    """Return up to *limit* access-log rows for a file, newest first.

    Returns [] when the query fails (failures are logged, not raised).
    """
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(
                """
                SELECT id, action, ip_address, user_agent, created_at
                FROM file_access_logs
                WHERE file_id = %s
                ORDER BY created_at DESC
                LIMIT %s
                """,
                (file_id, limit),
            )
            rows = cur.fetchall()
        return [dict(row) for row in rows]
    except Exception as e:
        logger.error("Failed to get file access logs", error=str(e))
        return []
    finally:
        conn.close()
def update_file_metadata(file_id: str, metadata: Dict[str, Any]) -> bool:
    """Overwrite the files.metadata JSON column for *file_id*.

    Returns True on success; on failure rolls back, logs, and returns False.
    """
    conn = get_db_connection()
    try:
        with conn.cursor() as cur:
            cur.execute(
                "UPDATE files SET metadata = %s, updated_at = %s WHERE id = %s",
                (json.dumps(metadata), datetime.utcnow(), file_id),
            )
        conn.commit()
        return True
    except Exception as e:
        logger.error("Failed to update file metadata", error=str(e))
        conn.rollback()
        return False
    finally:
        conn.close()
def get_storage_stats() -> Dict[str, Any]:
    """Aggregate system-wide statistics from the files table.

    Returns a dict with overall counts/sizes broken down by lifecycle
    status, a per-file-type distribution (deleted files excluded), and the
    number of uploads in the last 24 hours. Returns {} if any query fails.
    """
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # Total files and size
            cur.execute("""
                SELECT COUNT(*) as total_files,
                       SUM(file_size) as total_size,
                       COUNT(CASE WHEN status = 'uploaded' THEN 1 END) as uploaded_files,
                       COUNT(CASE WHEN status = 'processing' THEN 1 END) as processing_files,
                       COUNT(CASE WHEN status = 'transformed' THEN 1 END) as transformed_files,
                       COUNT(CASE WHEN status = 'error' THEN 1 END) as error_files,
                       COUNT(CASE WHEN status = 'deleted' THEN 1 END) as deleted_files
                FROM files
            """)
            stats = cur.fetchone()
            # File types distribution (deleted files excluded)
            cur.execute("""
                SELECT file_type, COUNT(*) as count
                FROM files
                WHERE status != 'deleted'
                GROUP BY file_type
                ORDER BY count DESC
            """)
            file_types = cur.fetchall()
            # Recent activity: uploads in the last 24 hours
            cur.execute("""
                SELECT COUNT(*) as recent_uploads
                FROM files
                WHERE created_at >= NOW() - INTERVAL '24 hours'
            """)
            recent = cur.fetchone()
            return {
                'stats': dict(stats),
                'file_types': [dict(ft) for ft in file_types],
                'recent_uploads': recent['recent_uploads'] if recent else 0
            }
    except Exception as e:
        logger.error("Failed to get storage stats", error=str(e))
        return {}
    finally:
        conn.close()
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: always reports healthy with the service name."""
    payload = {'status': 'healthy', 'service': 'file-metadata'}
    return jsonify(payload)
@app.route('/files/<file_id>/metadata', methods=['GET'])
def get_file_metadata_endpoint(file_id: str):
    """Get comprehensive file metadata.

    Combines the files row, its transformation history, the 10 most recent
    access-log entries, and whether the underlying object still exists in
    MinIO. Returns 404 when the file row is missing.
    """
    try:
        # Get basic file metadata
        file_metadata = get_file_metadata(file_id)
        if not file_metadata:
            return jsonify({'error': 'File not found'}), 404
        # Get transformations
        transformations = get_file_transformations(file_id)
        # Get recent access logs
        access_logs = get_file_access_logs(file_id, limit=10)
        # Check if file exists in MinIO (stat only — no download)
        minio_exists = False
        try:
            minio_client.stat_object(
                file_metadata['bucket_name'],
                file_metadata['object_key']
            )
            minio_exists = True
        except S3Error:
            # Missing object is an expected condition here, not an error.
            minio_exists = False
        response_data = {
            'file': file_metadata,
            'transformations': transformations,
            'access_logs': access_logs,
            'storage': {
                'minio_exists': minio_exists,
                'bucket': file_metadata['bucket_name'],
                'object_key': file_metadata['object_key']
            }
        }
        return jsonify(response_data), 200
    except Exception as e:
        logger.error("Error fetching file metadata", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
@app.route('/files/<file_id>/metadata', methods=['PUT'])
def update_file_metadata_endpoint(file_id: str):
    """Replace a file's metadata JSON with the request body."""
    try:
        payload = request.get_json()
        if not payload:
            return jsonify({'error': 'No data provided'}), 400
        # Verify the file exists before attempting the update.
        if get_file_metadata(file_id) is None:
            return jsonify({'error': 'File not found'}), 404
        if not update_file_metadata(file_id, payload):
            return jsonify({'error': 'Failed to update metadata'}), 500
        logger.info("File metadata updated", file_id=file_id)
        return jsonify({'message': 'Metadata updated successfully'}), 200
    except Exception as e:
        logger.error("Error updating file metadata", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
@app.route('/files/<file_id>/transformations', methods=['GET'])
def get_file_transformations_endpoint(file_id: str):
    """List all transformations recorded for the given file."""
    try:
        # 404 when the file row does not exist at all.
        if get_file_metadata(file_id) is None:
            return jsonify({'error': 'File not found'}), 404
        return jsonify(get_file_transformations(file_id)), 200
    except Exception as e:
        logger.error("Error fetching file transformations", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
@app.route('/files/<file_id>/access-logs', methods=['GET'])
def get_file_access_logs_endpoint(file_id: str):
    """List recent access-log rows for the file (?limit=N, default 50)."""
    try:
        # 404 when the file row does not exist at all.
        if get_file_metadata(file_id) is None:
            return jsonify({'error': 'File not found'}), 404
        limit = request.args.get('limit', 50, type=int)
        return jsonify(get_file_access_logs(file_id, limit=limit)), 200
    except Exception as e:
        logger.error("Error fetching file access logs", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
@app.route('/stats', methods=['GET'])
def get_stats_endpoint():
    """Return system-wide storage statistics as JSON."""
    try:
        return jsonify(get_storage_stats()), 200
    except Exception as e:
        logger.error("Error fetching stats", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
if __name__ == '__main__':
    # Local/dev entry point only; in containers the app is served by gunicorn.
    app.run(host='0.0.0.0', port=5000, debug=False)

View File

@@ -0,0 +1,24 @@
# Core dependencies
flask==2.3.3
gunicorn==21.2.0
python-dotenv==1.0.0
# Database
psycopg2-binary==2.9.7
sqlalchemy==2.0.21
# MinIO/S3
minio==7.1.17
boto3==1.28.44
# HTTP requests
requests==2.31.0
# JSON and data handling
pydantic==2.1.1
# Logging
structlog==23.1.0
# Utilities
python-dateutil==2.8.2

View File

@@ -0,0 +1,51 @@
# Core dependencies
flask==2.3.3
gunicorn==21.2.0
python-dotenv==1.0.0
# Database
psycopg2-binary==2.9.7
sqlalchemy==2.0.21
alembic==1.12.0
# MinIO/S3
minio==7.1.17
boto3==1.28.44
# File processing
python-magic==0.4.27
Pillow==10.0.1
PyPDF2==3.0.1
python-docx==0.8.11
openpyxl==3.1.2
pandas==2.0.3
numpy==1.24.3
# HTTP requests
requests==2.31.0
httpx==0.24.1
# JSON and data handling
pydantic==2.1.1
marshmallow==3.20.1
# Authentication and security
PyJWT==2.8.0
bcrypt==4.0.1
cryptography==41.0.4
# Logging and monitoring
structlog==23.1.0
prometheus-client==0.17.1
# Utilities
python-dateutil==2.8.2
pytz==2023.3
click==8.1.7
# Development and testing
pytest==7.4.2
pytest-cov==4.1.0
black==23.7.0
flake8==6.0.0
mypy==1.5.1

View File

@@ -0,0 +1,33 @@
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies (libmagic for python-magic, GL/glib for Pillow
# image work).
# BUG FIX: curl was missing — python:3.11-slim does not ship curl, so the
# HEALTHCHECK below always failed and the container was marked unhealthy.
RUN apt-get update && apt-get install -y \
    gcc \
    curl \
    libmagic1 \
    libgl1-mesa-glx \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app.py .

# Create non-root user so the service does not run as root
RUN useradd --create-home --shell /bin/bash app && chown -R app:app /app
USER app

# Expose port
EXPOSE 5000

# Health check against the Flask /health endpoint
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:5000/health || exit 1

# Run the application (fewer workers, longer timeout: transforms are heavy)
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "2", "--timeout", "600", "app:app"]

473
functions/transform/app.py Normal file
View File

@@ -0,0 +1,473 @@
import os
import uuid  # NOTE(review): uuid/tempfile/logging appear unused here — confirm before removing
import json
import tempfile
import logging
from datetime import datetime
from typing import Dict, Any, Optional, List
from pathlib import Path
from flask import Flask, request, jsonify
import psycopg2
from psycopg2.extras import RealDictCursor
from minio import Minio
from minio.error import S3Error
import structlog

# File processing imports
import PyPDF2
from docx import Document
import pandas as pd
from PIL import Image
import io

# Configure structured logging: each record is emitted as a single JSON
# object with an ISO-8601 timestamp, logger name, level, and rendered
# exception/stack info.
structlog.configure(
    processors=[
        structlog.stdlib.filter_by_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.UnicodeDecoder(),
        structlog.processors.JSONRenderer()
    ],
    context_class=dict,
    logger_factory=structlog.stdlib.LoggerFactory(),
    wrapper_class=structlog.stdlib.BoundLogger,
    cache_logger_on_first_use=True,  # build the wrapped logger once and reuse it
)
logger = structlog.get_logger()

app = Flask(__name__)
# Configuration — read from the environment, with local-development defaults.
# NOTE(review): the default MinIO and Postgres credentials below are dev-only
# placeholders; production deployments must override them via env vars.
MINIO_ENDPOINT = os.getenv('MINIO_ENDPOINT', 'localhost:9000')
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY', 'minioadmin')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY', 'minioadmin123')
MINIO_BUCKET_NAME = os.getenv('MINIO_BUCKET_NAME', 'file-transformer-bucket')
MINIO_USE_SSL = os.getenv('MINIO_USE_SSL', 'false').lower() == 'true'
POSTGRES_URL = os.getenv('POSTGRES_URL', 'postgresql://file_user:secure_password_123@localhost:5432/file_transformer')

# Initialize MinIO client (module-level, shared by all request handlers)
minio_client = Minio(
    MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    secure=MINIO_USE_SSL
)
def get_db_connection():
    """Open and return a fresh psycopg2 connection to the metadata database."""
    connection = psycopg2.connect(POSTGRES_URL)
    return connection
def get_file_from_minio(object_key: str) -> bytes:
    """Download an object's full contents from the configured bucket.

    Raises S3Error when the object cannot be fetched.
    """
    response = None
    try:
        response = minio_client.get_object(MINIO_BUCKET_NAME, object_key)
        return response.read()
    except S3Error as e:
        logger.error("Failed to get file from MinIO", object_key=object_key, error=str(e))
        raise
    finally:
        # BUG FIX: the original never closed the response, leaking the pooled
        # HTTP connection on every call (the download service releases it).
        if response is not None:
            response.close()
            response.release_conn()
def upload_file_to_minio(file_data: bytes, object_key: str) -> bool:
    """Upload raw bytes to the configured bucket under *object_key*.

    Returns True on success, False when MinIO rejects the upload.
    """
    try:
        # BUG FIX: minio-py's put_object() requires a readable stream for
        # `data`; passing raw bytes raised before anything was uploaded.
        minio_client.put_object(
            MINIO_BUCKET_NAME,
            object_key,
            io.BytesIO(file_data),
            length=len(file_data)
        )
        return True
    except S3Error as e:
        logger.error("Failed to upload file to MinIO", object_key=object_key, error=str(e))
        return False
def extract_text_from_pdf(file_data: bytes) -> str:
    """Extract the concatenated text of every page in a PDF, stripped.

    Raises on unreadable input (the failure is logged first).
    """
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(file_data))
        text = "\n".join(page.extract_text() for page in reader.pages)
    except Exception as e:
        logger.error("PDF text extraction failed", error=str(e))
        raise
    return text.strip()
def extract_text_from_docx(file_data: bytes) -> str:
    """Extract the concatenated paragraph text of a DOCX document, stripped.

    Raises on unreadable input (the failure is logged first).
    """
    try:
        document = Document(io.BytesIO(file_data))
        text = "\n".join(paragraph.text for paragraph in document.paragraphs)
    except Exception as e:
        logger.error("DOCX text extraction failed", error=str(e))
        raise
    return text.strip()
def convert_csv_to_json(file_data: bytes) -> List[Dict[str, Any]]:
    """Parse CSV bytes into a list of row dicts (one dict per record).

    Raises on malformed input (the failure is logged first).
    """
    try:
        frame = pd.read_csv(io.BytesIO(file_data))
        records = frame.to_dict('records')
    except Exception as e:
        logger.error("CSV to JSON conversion failed", error=str(e))
        raise
    return records
def convert_excel_to_json(file_data: bytes) -> List[Dict[str, Any]]:
    """Parse the first sheet of an Excel workbook into a list of row dicts.

    Raises on malformed input (the failure is logged first).
    """
    try:
        frame = pd.read_excel(io.BytesIO(file_data))
        records = frame.to_dict('records')
    except Exception as e:
        logger.error("Excel to JSON conversion failed", error=str(e))
        raise
    return records
def resize_image(file_data: bytes, width: int, height: int) -> bytes:
    """Resize an image to exactly (width, height) with LANCZOS resampling.

    The output keeps the source format when Pillow can detect it, falling
    back to JPEG. Raises on unreadable input (logged first).
    """
    try:
        source = Image.open(io.BytesIO(file_data))
        scaled = source.resize((width, height), Image.Resampling.LANCZOS)
        buffer = io.BytesIO()
        scaled.save(buffer, format=source.format or 'JPEG')
        return buffer.getvalue()
    except Exception as e:
        logger.error("Image resize failed", error=str(e))
        raise
def convert_image_format(file_data: bytes, target_format: str) -> bytes:
    """Re-encode an image in *target_format* (e.g. 'png', 'jpeg').

    Raises on unreadable input or unsupported format (logged first).
    """
    try:
        source = Image.open(io.BytesIO(file_data))
        buffer = io.BytesIO()
        source.save(buffer, format=target_format.upper())
        return buffer.getvalue()
    except Exception as e:
        logger.error("Image format conversion failed", error=str(e))
        raise
def create_transformation_record(file_id: str, transformation_type: str, config: Dict[str, Any]) -> str:
    """Insert a 'pending' transformations row and return its id as a string.

    Rolls back, logs, and re-raises on any database failure.
    """
    insert_sql = """
            INSERT INTO transformations (
                file_id, transformation_type, input_path, status, config, created_at
            ) VALUES (%s, %s, %s, %s, %s, %s)
            RETURNING id
        """
    params = (
        file_id,
        transformation_type,
        f"files/{file_id}",
        'pending',
        json.dumps(config),
        datetime.utcnow(),
    )
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(insert_sql, params)
            new_id = cur.fetchone()['id']
        conn.commit()
        return str(new_id)
    except Exception as e:
        conn.rollback()
        logger.error("Failed to create transformation record", error=str(e))
        raise
    finally:
        conn.close()
def update_transformation_status(transformation_id: str, status: str, result: Optional[Dict[str, Any]] = None, error_message: Optional[str] = None):
    """Advance a transformation row to *status*, stamping the right columns.

    'processing' sets started_at; 'completed' sets completed_at + result;
    'failed' sets completed_at + error_message. Any other status is a no-op.
    Rolls back, logs, and re-raises on database failure.
    """
    now = datetime.utcnow()
    conn = get_db_connection()
    try:
        with conn.cursor() as cur:
            if status == 'processing':
                cur.execute(
                    "UPDATE transformations SET status = %s, started_at = %s WHERE id = %s",
                    (status, now, transformation_id),
                )
            elif status == 'completed':
                cur.execute(
                    "UPDATE transformations SET status = %s, completed_at = %s, result = %s WHERE id = %s",
                    (status, now, json.dumps(result), transformation_id),
                )
            elif status == 'failed':
                cur.execute(
                    "UPDATE transformations SET status = %s, completed_at = %s, error_message = %s WHERE id = %s",
                    (status, now, error_message, transformation_id),
                )
        conn.commit()
    except Exception as e:
        conn.rollback()
        logger.error("Failed to update transformation status", error=str(e))
        raise
    finally:
        conn.close()
def get_file_info(file_id: str) -> Optional[Dict[str, Any]]:
    """Fetch the minimal files row needed to run a transformation.

    Returns the row as a dict, or None when the file is unknown or the
    query fails (failures are logged, not raised).
    """
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(
                """
                SELECT id, filename, file_type, mime_type, object_key, status
                FROM files
                WHERE id = %s
                """,
                (file_id,),
            )
            row = cur.fetchone()
        return dict(row) if row else None
    except Exception as e:
        logger.error("Failed to get file info", error=str(e))
        return None
    finally:
        conn.close()
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: always reports healthy with the service name."""
    payload = {'status': 'healthy', 'service': 'file-transform'}
    return jsonify(payload)
@app.route('/transform', methods=['POST'])
def transform_file():
    """Handle a file transformation request.

    Expects JSON {"file_id", "transformation_type", "config"?}. Creates a
    transformation row, runs the requested transformation on the object
    fetched from MinIO, uploads any produced artifact under
    transformations/<id>/..., marks the transformation (and the file) as
    done, and returns the outcome as JSON. Returns 400/404 for bad
    requests, 500 when the transformation itself fails.
    """
    try:
        data = request.get_json()
        if not data:
            return jsonify({'error': 'No data provided'}), 400
        file_id = data.get('file_id')
        transformation_type = data.get('transformation_type')
        config = data.get('config', {})
        if not file_id or not transformation_type:
            return jsonify({'error': 'file_id and transformation_type are required'}), 400
        # Get file information
        file_info = get_file_info(file_id)
        if not file_info:
            return jsonify({'error': 'File not found'}), 404
        if file_info['status'] == 'deleted':
            return jsonify({'error': 'File has been deleted'}), 400
        # Create transformation record and mark it as running.
        transformation_id = create_transformation_record(file_id, transformation_type, config)
        update_transformation_status(transformation_id, 'processing')
        logger.info("Starting transformation",
                    file_id=file_id,
                    transformation_id=transformation_id,
                    transformation_type=transformation_type)
        try:
            # Get file from MinIO (fully buffered in memory).
            file_data = get_file_from_minio(file_info['object_key'])
            source_path = Path(file_info['filename'])
            # Perform transformation based on type
            result = None
            output_data = None
            if transformation_type == 'extract_text':
                if file_info['file_type'] == 'pdf':
                    result = extract_text_from_pdf(file_data)
                elif file_info['file_type'] in ['docx', 'doc']:
                    result = extract_text_from_docx(file_data)
                else:
                    raise ValueError(f"Text extraction not supported for file type: {file_info['file_type']}")
                # Save extracted text as a new object
                output_filename = f"{source_path.stem}_extracted.txt"
                output_object_key = f"transformations/{transformation_id}/{output_filename}"
                output_data = result.encode('utf-8')
            elif transformation_type == 'csv_to_json':
                if file_info['file_type'] != 'csv':
                    raise ValueError("CSV to JSON conversion only supports CSV files")
                result = convert_csv_to_json(file_data)
                output_filename = f"{source_path.stem}.json"
                output_object_key = f"transformations/{transformation_id}/{output_filename}"
                output_data = json.dumps(result, indent=2).encode('utf-8')
            elif transformation_type == 'excel_to_json':
                if file_info['file_type'] not in ['xlsx', 'xls']:
                    raise ValueError("Excel to JSON conversion only supports Excel files")
                result = convert_excel_to_json(file_data)
                output_filename = f"{source_path.stem}.json"
                output_object_key = f"transformations/{transformation_id}/{output_filename}"
                output_data = json.dumps(result, indent=2).encode('utf-8')
            elif transformation_type == 'resize_image':
                if not file_info['mime_type'].startswith('image/'):
                    raise ValueError("Image resize only supports image files")
                width = config.get('width', 800)
                height = config.get('height', 600)
                output_data = resize_image(file_data, width, height)
                # BUG FIX: Path.suffix already includes the leading dot; the
                # old "..._resized.{suffix}" produced "name_resized..ext".
                output_filename = f"{source_path.stem}_resized{source_path.suffix}"
                output_object_key = f"transformations/{transformation_id}/{output_filename}"
            elif transformation_type == 'convert_image':
                if not file_info['mime_type'].startswith('image/'):
                    raise ValueError("Image conversion only supports image files")
                target_format = config.get('format', 'JPEG')
                output_data = convert_image_format(file_data, target_format)
                output_filename = f"{source_path.stem}.{target_format.lower()}"
                output_object_key = f"transformations/{transformation_id}/{output_filename}"
            else:
                raise ValueError(f"Unsupported transformation type: {transformation_type}")
            # Upload transformed file to MinIO
            if output_data:
                if not upload_file_to_minio(output_data, output_object_key):
                    raise Exception("Failed to upload transformed file")
            # Update transformation as completed
            update_transformation_status(transformation_id, 'completed', {
                'output_object_key': output_object_key,
                'output_filename': output_filename,
                'result': result if isinstance(result, (str, list, dict)) else None
            })
            # Update file status (close the connection even if the commit fails)
            conn = get_db_connection()
            try:
                with conn.cursor() as cur:
                    cur.execute("""
                        UPDATE files
                        SET status = 'transformed', transformation_type = %s, processed_at = %s
                        WHERE id = %s
                    """, (transformation_type, datetime.utcnow(), file_id))
                conn.commit()
            finally:
                conn.close()
            response_data = {
                'transformation_id': transformation_id,
                'file_id': file_id,
                'transformation_type': transformation_type,
                'status': 'completed',
                'output_object_key': output_object_key,
                'output_filename': output_filename,
                'completed_at': datetime.utcnow().isoformat()
            }
            logger.info("Transformation completed",
                        transformation_id=transformation_id,
                        file_id=file_id)
            return jsonify(response_data), 200
        except Exception as e:
            error_message = str(e)
            logger.error("Transformation failed",
                         transformation_id=transformation_id,
                         file_id=file_id,
                         error=error_message)
            # Update transformation as failed
            update_transformation_status(transformation_id, 'failed', error_message=error_message)
            return jsonify({
                'transformation_id': transformation_id,
                'file_id': file_id,
                'status': 'failed',
                'error': error_message
            }), 500
    except Exception as e:
        logger.error("Transform request error", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
@app.route('/transformations/<transformation_id>', methods=['GET'])
def get_transformation_status(transformation_id: str):
    """Get transformation status and details.

    Returns the full transformations row as JSON, 404 when unknown,
    500 on database failure.
    """
    conn = None  # BUG FIX: previously unbound in `finally` if connect failed
    try:
        conn = get_db_connection()
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT id, file_id, transformation_type, input_path, output_path,
                       status, config, result, error_message, started_at, completed_at, created_at
                FROM transformations
                WHERE id = %s
            """, (transformation_id,))
            transformation = cur.fetchone()
        if not transformation:
            return jsonify({'error': 'Transformation not found'}), 404
        return jsonify(dict(transformation)), 200
    except Exception as e:
        logger.error("Error fetching transformation", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
    finally:
        if conn is not None:
            conn.close()
@app.route('/transformations/<transformation_id>/retry', methods=['POST'])
def retry_transformation(transformation_id: str):
    """Retry a failed transformation.

    Only transformations whose current status is 'failed' may be retried;
    the row is reset to 'pending' and the /transform endpoint is re-invoked
    with the original file id, type, and config.
    """
    conn = None  # BUG FIX: previously unbound in `finally` if connect failed
    try:
        conn = get_db_connection()
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # BUG FIX: the original SELECT omitted `status`, so the check
            # below always raised KeyError and every retry returned 500.
            cur.execute("""
                SELECT file_id, transformation_type, config, status
                FROM transformations
                WHERE id = %s
            """, (transformation_id,))
            transformation = cur.fetchone()
            if not transformation:
                return jsonify({'error': 'Transformation not found'}), 404
            if transformation['status'] != 'failed':
                return jsonify({'error': 'Only failed transformations can be retried'}), 400
            # Reset transformation status
            cur.execute("""
                UPDATE transformations
                SET status = 'pending', started_at = NULL, completed_at = NULL,
                    error_message = NULL, result = NULL
                WHERE id = %s
            """, (transformation_id,))
            conn.commit()
        # Trigger new transformation with the original parameters.
        transform_data = {
            'file_id': transformation['file_id'],
            'transformation_type': transformation['transformation_type'],
            'config': transformation['config'] or {}
        }
        # Call the transform endpoint internally (synchronous re-run).
        with app.test_client() as client:
            response = client.post('/transform', json=transform_data)
            return response.get_json(), response.status_code
    except Exception as e:
        logger.error("Error retrying transformation", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
    finally:
        if conn is not None:
            conn.close()
if __name__ == '__main__':
    # Local development entry point only; production runs under gunicorn
    # (see the Dockerfile CMD, which binds the same host/port).
    app.run(host='0.0.0.0', port=5000, debug=False)

View File

@@ -0,0 +1,33 @@
# Core dependencies
flask==2.3.3
gunicorn==21.2.0
python-dotenv==1.0.0
# Database
psycopg2-binary==2.9.7
sqlalchemy==2.0.21
# MinIO/S3
minio==7.1.17
boto3==1.28.44
# File processing
python-magic==0.4.27
Pillow==10.0.1
PyPDF2==3.0.1
python-docx==0.8.11
openpyxl==3.1.2
pandas==2.0.3
numpy==1.24.3
# HTTP requests
requests==2.31.0
# JSON and data handling
pydantic==2.1.1
# Logging
structlog==23.1.0
# Utilities
python-dateutil==2.8.2

View File

@@ -0,0 +1,31 @@
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies.
# curl is required by the HEALTHCHECK below: python:3.11-slim does not
# ship curl, so without installing it the health probe always fails and
# the container is reported unhealthy.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    libmagic1 \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app.py .

# Create non-root user
RUN useradd --create-home --shell /bin/bash app && chown -R app:app /app
USER app

# Expose port
EXPOSE 5000

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:5000/health || exit 1

# Run the application
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "4", "--timeout", "300", "app:app"]

287
functions/upload/app.py Normal file
View File

@@ -0,0 +1,287 @@
import hashlib
import io
import logging
import os
import uuid
from datetime import datetime
from typing import Dict, Any, Optional

import magic
import psycopg2
import structlog
from flask import Flask, request, jsonify
from minio import Minio
from minio.error import S3Error
from psycopg2.extras import RealDictCursor
from werkzeug.utils import secure_filename
# Configure structured logging: every processor feeds the next, ending in
# a single-line JSON record per event (suited to container log collectors).
structlog.configure(
    processors=[
        structlog.stdlib.filter_by_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.UnicodeDecoder(),
        structlog.processors.JSONRenderer()
    ],
    context_class=dict,
    logger_factory=structlog.stdlib.LoggerFactory(),
    wrapper_class=structlog.stdlib.BoundLogger,
    cache_logger_on_first_use=True,
)
logger = structlog.get_logger()
app = Flask(__name__)
# Configuration, read from the environment with local-dev fallbacks.
# NOTE(review): the fallback credentials/URL below are development
# defaults committed in source — confirm deployments always override
# them via environment variables.
MINIO_ENDPOINT = os.getenv('MINIO_ENDPOINT', 'localhost:9000')
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY', 'minioadmin')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY', 'minioadmin123')
MINIO_BUCKET_NAME = os.getenv('MINIO_BUCKET_NAME', 'file-transformer-bucket')
MINIO_USE_SSL = os.getenv('MINIO_USE_SSL', 'false').lower() == 'true'
POSTGRES_URL = os.getenv('POSTGRES_URL', 'postgresql://file_user:secure_password_123@localhost:5432/file_transformer')
# Initialize the module-level MinIO client shared by all handlers.
minio_client = Minio(
    MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    secure=MINIO_USE_SSL
)
def get_db_connection():
    """Open and return a new psycopg2 connection using POSTGRES_URL."""
    connection = psycopg2.connect(POSTGRES_URL)
    return connection
def calculate_file_hash(file_data: bytes) -> str:
    """Return the hex-encoded SHA-256 digest of *file_data*."""
    digest = hashlib.sha256()
    digest.update(file_data)
    return digest.hexdigest()
def get_file_metadata(file_data: bytes, filename: str) -> Dict[str, Any]:
    """Build the metadata dict (MIME type, size, extension, checksum) for a file.

    The MIME type is sniffed from the content via libmagic; the file type
    is taken from the filename extension ('' when there is none).
    """
    if '.' in filename:
        extension = filename.rsplit('.', 1)[1].lower()
    else:
        extension = ''
    return {
        'mime_type': magic.from_buffer(file_data, mime=True),
        'file_size': len(file_data),
        'file_type': extension,
        'checksum': calculate_file_hash(file_data),
    }
def save_file_to_database(file_data: bytes, filename: str, object_key: str, metadata: Dict[str, Any]) -> str:
    """Save file information to PostgreSQL database.

    Inserts a row into ``files`` with status 'uploaded' and returns the
    new row's id as a string. On any error the transaction is rolled
    back and the exception re-raised for the caller to handle.

    NOTE(review): ``file_data`` is accepted but never used here — only
    the precomputed ``metadata`` is stored; confirm whether the
    parameter can be dropped at the call sites.
    """
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # Insert file record; filename doubles as original_filename,
            # and object_key doubles as file_path.
            cur.execute("""
                INSERT INTO files (
                    filename, original_filename, file_path, file_size,
                    file_type, mime_type, bucket_name, object_key,
                    checksum, status, created_at
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                RETURNING id
            """, (
                filename,
                filename,
                object_key,
                metadata['file_size'],
                metadata['file_type'],
                metadata['mime_type'],
                MINIO_BUCKET_NAME,
                object_key,
                metadata['checksum'],
                'uploaded',
                datetime.utcnow()
            ))
            file_id = cur.fetchone()['id']
            conn.commit()
            return str(file_id)
    except Exception as e:
        conn.rollback()
        logger.error("Database error", error=str(e))
        raise
    finally:
        conn.close()
def upload_to_minio(file_data: bytes, object_key: str) -> bool:
    """Upload raw bytes to the configured MinIO bucket.

    Creates the bucket on first use. Returns True on success and False
    on any S3 error (the caller maps False to an HTTP 500).
    """
    try:
        # Ensure bucket exists
        if not minio_client.bucket_exists(MINIO_BUCKET_NAME):
            minio_client.make_bucket(MINIO_BUCKET_NAME)
            logger.info("Created bucket", bucket=MINIO_BUCKET_NAME)
        # put_object requires a readable stream, not raw bytes — passing
        # bytes raises a TypeError at runtime. Wrap the payload in
        # BytesIO so the client can stream it.
        minio_client.put_object(
            MINIO_BUCKET_NAME,
            object_key,
            io.BytesIO(file_data),
            length=len(file_data)
        )
        logger.info("File uploaded to MinIO", bucket=MINIO_BUCKET_NAME, object_key=object_key)
        return True
    except S3Error as e:
        logger.error("MinIO upload error", error=str(e))
        return False
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe used by the container HEALTHCHECK."""
    payload = {'status': 'healthy', 'service': 'file-upload'}
    return jsonify(payload)
@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle file upload request.

    Expects a multipart/form-data body with a 'file' part. Stores the
    payload in MinIO under ``uploads/<uuid>/<filename>``, records it in
    the database, logs the access, and returns the new file's metadata
    with HTTP 201.
    """
    try:
        # Check if file is present in request
        if 'file' not in request.files:
            return jsonify({'error': 'No file provided'}), 400
        file = request.files['file']
        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400
        # Read file data
        file_data = file.read()
        if not file_data:
            return jsonify({'error': 'Empty file'}), 400
        # Secure filename and generate object key. secure_filename() can
        # return '' for names made entirely of unsafe characters, so fall
        # back to a placeholder to keep the object key well-formed.
        filename = secure_filename(file.filename) or 'unnamed'
        file_id = str(uuid.uuid4())
        # Bug fix: the key previously ended with the literal '(unknown)'
        # (a leaked template placeholder) instead of the uploaded name.
        object_key = f"uploads/{file_id}/{filename}"
        # Extract metadata
        metadata = get_file_metadata(file_data, filename)
        logger.info("Processing file upload",
                    filename=filename,
                    size=metadata['file_size'],
                    mime_type=metadata['mime_type'])
        # Upload to MinIO
        if not upload_to_minio(file_data, object_key):
            return jsonify({'error': 'Failed to upload file to storage'}), 500
        # Save to database
        db_file_id = save_file_to_database(file_data, filename, object_key, metadata)
        # Log access
        log_file_access(db_file_id, 'upload', request.remote_addr, request.headers.get('User-Agent'))
        response_data = {
            'file_id': db_file_id,
            'filename': filename,
            'object_key': object_key,
            'file_size': metadata['file_size'],
            'mime_type': metadata['mime_type'],
            'checksum': metadata['checksum'],
            'status': 'uploaded',
            'uploaded_at': datetime.utcnow().isoformat()
        }
        logger.info("File upload completed", file_id=db_file_id, filename=filename)
        return jsonify(response_data), 201
    except Exception as e:
        logger.error("Upload error", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
def log_file_access(file_id: str, action: str, ip_address: str, user_agent: Optional[str]):
    """Record an audit row in file_access_logs for a file action.

    Best-effort: failures are logged and rolled back, never raised, so
    audit logging cannot break the request that triggered it.
    """
    conn = get_db_connection()
    try:
        with conn.cursor() as cur:
            cur.execute("""
                INSERT INTO file_access_logs (file_id, action, ip_address, user_agent)
                VALUES (%s, %s, %s, %s)
            """, (file_id, action, ip_address, user_agent))
        conn.commit()
    except Exception as db_err:
        logger.error("Failed to log file access", error=str(db_err))
        conn.rollback()
    finally:
        conn.close()
@app.route('/files/<file_id>', methods=['GET'])
def get_file_info(file_id: str):
    """Get file information by ID.

    Returns:
        200 with the file row as JSON,
        404 when the id is unknown,
        500 on any database error.
    """
    conn = None
    try:
        conn = get_db_connection()
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT id, filename, original_filename, file_size, file_type,
                       mime_type, bucket_name, object_key, checksum, status,
                       created_at, updated_at
                FROM files
                WHERE id = %s
            """, (file_id,))
            file_record = cur.fetchone()
            if not file_record:
                return jsonify({'error': 'File not found'}), 404
            return jsonify(dict(file_record)), 200
    except Exception as e:
        logger.error("Error fetching file info", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
    finally:
        # conn stays None when get_db_connection() itself raises; the
        # original unconditional conn.close() caused a NameError here.
        if conn is not None:
            conn.close()
@app.route('/files/<file_id>', methods=['DELETE'])
def delete_file(file_id: str):
    """Delete file from storage and database.

    Removes the object from MinIO (best-effort) and soft-deletes the
    database row by setting status='deleted'. Returns 200 on success,
    404 for unknown ids, 500 on database errors.
    """
    conn = None
    try:
        conn = get_db_connection()
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # Get file info
            cur.execute("SELECT object_key FROM files WHERE id = %s", (file_id,))
            file_record = cur.fetchone()
            if not file_record:
                return jsonify({'error': 'File not found'}), 404
            object_key = file_record['object_key']
            # Delete from MinIO; a missing object is only a warning so
            # the database soft-delete still proceeds.
            try:
                minio_client.remove_object(MINIO_BUCKET_NAME, object_key)
                logger.info("File deleted from MinIO", object_key=object_key)
            except S3Error as e:
                logger.warning("File not found in MinIO", object_key=object_key, error=str(e))
            # Mark as deleted in database (soft delete)
            cur.execute("""
                UPDATE files
                SET status = 'deleted', deleted_at = %s
                WHERE id = %s
            """, (datetime.utcnow(), file_id))
            conn.commit()
            # Log access
            log_file_access(file_id, 'delete', request.remote_addr, request.headers.get('User-Agent'))
            return jsonify({'message': 'File deleted successfully'}), 200
    except Exception as e:
        logger.error("Error deleting file", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
    finally:
        # conn stays None when get_db_connection() itself raises; the
        # original unconditional conn.close() caused a NameError here.
        if conn is not None:
            conn.close()
if __name__ == '__main__':
    # Local development entry point only; production runs under gunicorn
    # (see the Dockerfile CMD, which binds the same host/port).
    app.run(host='0.0.0.0', port=5000, debug=False)

View File

@@ -0,0 +1,27 @@
# Core dependencies
flask==2.3.3
gunicorn==21.2.0
python-dotenv==1.0.0
# Database
psycopg2-binary==2.9.7
sqlalchemy==2.0.21
# MinIO/S3
minio==7.1.17
boto3==1.28.44
# File processing
python-magic==0.4.27
# HTTP requests
requests==2.31.0
# JSON and data handling
pydantic==2.1.1
# Logging
structlog==23.1.0
# Utilities
python-dateutil==2.8.2