feat: Complete Smart Resume Formatter with R2 and Gemini AI integration

- Integrated Cloudflare R2 for template storage and converted file management
- Added Google Gemini AI for resume parsing and HTML generation
- Created backend API endpoints for templates, conversion, and history
- Refactored frontend to use real API instead of mock data
- Fixed Docker networking issues (IPv6/IPv4) for R2 connectivity
- Added resumeService.ts for frontend API integration
- Updated Vite configuration for proper asset serving in Docker
- Successfully tested with 13 templates from R2 bucket
2025-10-14 21:43:41 +05:30
parent ee030b70bc
commit cda50356b4
34 changed files with 2604 additions and 360 deletions
--- a/backend/app/api/api.py
+++ b/backend/app/api/api.py
@@ -1,6 +1,7 @@
 from fastapi import APIRouter

-from app.api.endpoints import people
+from app.api.endpoints import people, resumes

 api_router = APIRouter()
 api_router.include_router(people.router, prefix="/people", tags=["people"])
+api_router.include_router(resumes.router, prefix="/resumes", tags=["resumes"])
--- a/backend/app/api/endpoints/resumes.py
+++ b/backend/app/api/endpoints/resumes.py
@@ -0,0 +1,179 @@
+from fastapi import APIRouter, UploadFile, File, Form, HTTPException
+from fastapi.responses import JSONResponse
+from typing import List, Dict, Optional
+import io
+
+from app.services.r2_service import r2_service
+from app.services.ai_service import ai_service
+
+router = APIRouter()
+
+
+@router.get("/templates", response_model=List[str])
+async def get_templates():
+    """
+    Get list of available resume templates from R2.
+
+    Returns:
+        The list of template names produced by
+        ``r2_service.list_templates()``.
+
+    Raises:
+        HTTPException: 500 if the storage lookup raises.
+    """
+    # Imported locally so this endpoint stays self-contained.
+    from fastapi.concurrency import run_in_threadpool
+
+    try:
+        # The R2 client call is synchronous network I/O; run it in the
+        # threadpool so it does not stall the event loop.
+        return await run_in_threadpool(r2_service.list_templates)
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to fetch templates: {str(e)}"
+        ) from e
+
+
+@router.get("/templates/{template_name}")
+async def get_template_content(template_name: str):
+    """
+    Get the HTML content of a specific template.
+
+    Args:
+        template_name: Template name, passed through unchanged to
+            ``r2_service.get_template_content``.
+
+    Returns:
+        ``{"content": <template HTML>}``.
+
+    Raises:
+        HTTPException: 404 if the service returns ``None``; 500 if the
+            lookup raises.
+    """
+    # Imported locally so this endpoint stays self-contained.
+    from fastapi.concurrency import run_in_threadpool
+
+    try:
+        # Synchronous storage call: keep it off the event loop.
+        content = await run_in_threadpool(
+            r2_service.get_template_content, template_name
+        )
+        if content is None:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Template '{template_name}' not found"
+            )
+        return {"content": content}
+    except HTTPException:
+        # Preserve deliberate HTTP errors (the 404 above) as-is.
+        raise
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to fetch template content: {str(e)}"
+        ) from e
+
+
+@router.post("/convert")
+async def convert_resume(
+    file: UploadFile = File(...),
+    template_name: str = Form(...)
+):
+    """
+    Convert a resume file using the specified template.
+
+    Steps:
+    1. Extract text from the resume using Gemini AI
+    2. Get template content from R2
+    3. Generate formatted HTML using Gemini AI
+    4. Upload the generated HTML to R2
+    5. Return the HTML URL and content
+
+    Args:
+        file: Uploaded resume; only PDF and DOCX content types are accepted.
+        template_name: Name of the template to merge the resume into.
+
+    Returns:
+        Dict with ``success``, ``html_url``, ``html_content`` and ``message``.
+
+    Raises:
+        HTTPException: 400 for an unsupported content type or an empty file,
+            404 if the template is missing, 500 for any extraction,
+            generation, upload or unexpected failure.
+    """
+    # Imported locally so this endpoint stays self-contained.
+    from urllib.parse import quote
+    from fastapi.concurrency import run_in_threadpool
+
+    def _ascii_safe(value: str) -> str:
+        # S3 object metadata must be ASCII; botocore rejects anything else
+        # (e.g. "résumé.pdf"). ASCII values are stored unchanged.
+        return value if value.isascii() else quote(value)
+
+    try:
+        # Validate file type
+        allowed_types = [
+            'application/pdf',
+            'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+        ]
+        if file.content_type not in allowed_types:
+            raise HTTPException(
+                status_code=400,
+                detail="Invalid file type. Only PDF and DOCX files are allowed."
+            )
+
+        # Read file content
+        file_content = await file.read()
+        if not file_content:
+            raise HTTPException(
+                status_code=400,
+                detail="Uploaded file is empty."
+            )
+
+        # Step 1: Extract text from resume
+        resume_text = await ai_service.extract_text_from_resume(
+            file_content,
+            file.content_type
+        )
+        if not resume_text:
+            raise HTTPException(
+                status_code=500,
+                detail="Failed to extract text from resume"
+            )
+
+        # Step 2: Get template content (synchronous storage call, so it is
+        # run in the threadpool to keep the event loop responsive)
+        template_html = await run_in_threadpool(
+            r2_service.get_template_content, template_name
+        )
+        if not template_html:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Template '{template_name}' not found"
+            )
+
+        # Step 3: Generate formatted HTML
+        generated_html = await ai_service.generate_html_from_template(
+            resume_text,
+            template_html
+        )
+        if not generated_html:
+            raise HTTPException(
+                status_code=500,
+                detail="Failed to generate formatted HTML"
+            )
+
+        # Step 4: Upload HTML to R2
+        # The client may omit the filename, or send one with directory
+        # components; keep only the final path segment for the object key.
+        original_filename = file.filename or "resume"
+        safe_name = original_filename.replace('\\', '/').rsplit('/', 1)[-1] or "resume"
+        base_filename = safe_name.rsplit('.', 1)[0] or "resume"
+        html_filename = f"{base_filename}_{template_name}.html"
+
+        html_url = await run_in_threadpool(
+            r2_service.upload_converted_file,
+            generated_html.encode('utf-8'),
+            html_filename,
+            'text/html',
+            metadata={
+                'original_filename': _ascii_safe(original_filename),
+                'template': _ascii_safe(template_name)
+            }
+        )
+
+        if not html_url:
+            raise HTTPException(
+                status_code=500,
+                detail="Failed to upload HTML to storage"
+            )
+
+        # Return response
+        return {
+            "success": True,
+            "html_url": html_url,
+            "html_content": generated_html,
+            "message": "Resume converted successfully"
+        }
+
+    except HTTPException:
+        # Preserve the deliberate HTTP errors raised above.
+        raise
+    except Exception as e:
+        print(f"Error converting resume: {e}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"An error occurred during conversion: {str(e)}"
+        ) from e
+
+
+@router.get("/history", response_model=List[Dict])
+async def get_conversion_history(limit: int = 50):
+    """
+    Get list of previously converted resumes from R2.
+
+    Args:
+        limit: Maximum number of entries to return; must be at least 1.
+
+    Returns:
+        The list of file metadata dicts produced by
+        ``r2_service.list_converted_resumes``.
+
+    Raises:
+        HTTPException: 400 if ``limit`` is not positive; 500 if the storage
+            lookup raises.
+    """
+    # Imported locally so this endpoint stays self-contained.
+    from fastapi.concurrency import run_in_threadpool
+
+    # Reject nonsensical limits up front instead of forwarding them to the
+    # storage API.
+    if limit < 1:
+        raise HTTPException(
+            status_code=400,
+            detail="limit must be a positive integer"
+        )
+
+    try:
+        # Synchronous storage call: keep it off the event loop.
+        return await run_in_threadpool(
+            r2_service.list_converted_resumes, limit=limit
+        )
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to fetch conversion history: {str(e)}"
+        ) from e
+
+
+@router.get("/download/{file_key:path}")
+async def get_download_url(file_key: str):
+    """
+    Get a presigned download URL for a file.
+
+    Args:
+        file_key: Object key in the bucket. Only keys under the prefixes
+            managed by ``r2_service`` (converted resumes and templates)
+            are accepted.
+
+    Returns:
+        ``{"url": <presigned URL>}``.
+
+    Raises:
+        HTTPException: 404 if the key is outside the managed prefixes or no
+            URL could be produced; 500 if URL generation raises.
+    """
+    # file_key comes straight from the request path. Without this check any
+    # caller could obtain a signed URL for every object in the bucket, so
+    # restrict signing to the prefixes this service owns.
+    allowed_prefixes = (
+        r2_service.converted_prefix,
+        r2_service.templates_prefix,
+    )
+    if not file_key.startswith(allowed_prefixes):
+        raise HTTPException(
+            status_code=404,
+            detail="File not found"
+        )
+
+    try:
+        url = r2_service.get_file_url(file_key)
+        if not url:
+            raise HTTPException(
+                status_code=404,
+                detail="File not found"
+            )
+        return {"url": url}
+    except HTTPException:
+        # Preserve the deliberate 404 above.
+        raise
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to generate download URL: {str(e)}"
+        ) from e
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -12,7 +12,7 @@ class Settings:
    """
    APP_NAME: str = os.getenv("APP_NAME", "ResumeFormatter")
    API_V1_STR: str = f"/{APP_NAME}/api"
-    PROJECT_NAME: str = "Profile Linker API"
+    PROJECT_NAME: str = "Smart Resume Formatter API"
    
    # CORS settings
    BACKEND_CORS_ORIGINS: List[str] = ["*"]
@@ -20,6 +20,15 @@ class Settings:
    # Database settings - using in-memory database by default
    # In a production environment, you would use a real database connection string
    DATABASE_URL: Optional[str] = None
+    
+    # Gemini AI settings
+    GEMINI_API_KEY: str = os.getenv("GEMINI_API_KEY", "")
+    
+    # Cloudflare R2 settings
+    R2_ENDPOINT: str = os.getenv("R2_ENDPOINT", "")
+    R2_ACCESS_KEY_ID: str = os.getenv("R2_ACCESS_KEY_ID", "")
+    R2_SECRET_ACCESS_KEY: str = os.getenv("R2_SECRET_ACCESS_KEY", "")
+    R2_BUCKET_NAME: str = os.getenv("R2_BUCKET_NAME", "e-teams")


 settings = Settings()
--- a/backend/app/services/init.py
+++ b/backend/app/services/init.py
@@ -0,0 +1 @@
+# Services module
--- a/backend/app/services/ai_service.py
+++ b/backend/app/services/ai_service.py
@@ -0,0 +1,135 @@
+"""
+Gemini AI Service
+Handles resume text extraction and HTML generation
+"""
+import google.generativeai as genai
+from typing import Optional
+import base64
+from app.core.config import settings
+
+
+class AIService:
+    """Service for interacting with Google Gemini AI"""
+
+    def __init__(self):
+        """
+        Initialize Gemini AI with the API key from settings.
+
+        Raises:
+            ValueError: If ``settings.GEMINI_API_KEY`` is empty.
+        """
+        if not settings.GEMINI_API_KEY:
+            raise ValueError("GEMINI_API_KEY not configured")
+        genai.configure(api_key=settings.GEMINI_API_KEY)
+        self.model = genai.GenerativeModel('gemini-2.0-flash-exp')
+
+    @staticmethod
+    def _strip_code_fence(text: str) -> str:
+        """
+        Remove a surrounding Markdown code fence from model output.
+
+        Handles an opening fence with or without a language tag
+        (```` ```html ```` or a bare ```` ``` ````) and a closing fence.
+        Text without fences is returned stripped of outer whitespace.
+        """
+        import re
+
+        cleaned = text.strip()
+        # Opening fence plus optional language tag and following whitespace.
+        cleaned = re.sub(r'^```[\w-]*\s*', '', cleaned)
+        # Closing fence and any whitespace before it.
+        cleaned = re.sub(r'\s*```$', '', cleaned)
+        return cleaned.strip()
+
+    async def extract_text_from_resume(
+        self,
+        file_content: bytes,
+        mime_type: str
+    ) -> Optional[str]:
+        """
+        Extract text from resume file using Gemini
+        Args:
+            file_content: File content as bytes
+            mime_type: MIME type of the file (application/pdf or application/vnd.openxmlformats-officedocument.wordprocessingml.document)
+        Returns: Extracted text or None if failed
+        """
+        try:
+            # Convert bytes to base64
+            base64_data = base64.b64encode(file_content).decode('utf-8')
+
+            prompt = """Extract all text from this resume document. 
+Preserve the original structure, including sections, headings, bullet points, and line breaks, as plain text.
+Focus on maintaining the hierarchical structure of the content."""
+
+            # Use the async client call so the request does not block the
+            # event loop while waiting on the model.
+            response = await self.model.generate_content_async([
+                {
+                    'mime_type': mime_type,
+                    'data': base64_data
+                },
+                prompt
+            ])
+
+            return response.text
+        except Exception as e:
+            # Best-effort: callers treat None as "extraction failed".
+            print(f"Error extracting text from resume: {e}")
+            return None
+
+    async def generate_html_from_template(
+        self,
+        resume_text: str,
+        template_html: str
+    ) -> Optional[str]:
+        """
+        Generate formatted HTML by merging resume content with template
+        Args:
+            resume_text: Extracted resume text
+            template_html: HTML template content
+        Returns: Generated HTML or None if failed
+        """
+        try:
+            prompt = self._build_generation_prompt(resume_text, template_html)
+
+            # Async call keeps the event loop free during generation.
+            response = await self.model.generate_content_async(prompt)
+
+            # Clean up the response (remove code fences if present)
+            return self._strip_code_fence(response.text)
+        except Exception as e:
+            # Best-effort: callers treat None as "generation failed".
+            print(f"Error generating HTML: {e}")
+            return None
+
+    def _build_generation_prompt(self, resume_text: str, template_html: str) -> str:
+        """Build the prompt for HTML generation"""
+        instructions = """### 🎯 EXACT TEMPLATE PRESERVATION INSTRUCTIONS:
+
+**🚨 RULE #1: COPY TEMPLATE EXACTLY - NO STRUCTURAL CHANGES! 🚨**
+**🚨 RULE #2: ONLY REPLACE PLACEHOLDER TEXT - NOTHING ELSE! 🚨**
+
+**YOU ARE A FIND-AND-REPLACE TOOL - NOT A DESIGNER!**
+
+**SIMPLE 3-STEP PROCESS:**
+1. **COPY**: Take the entire HTML template (every character from <!DOCTYPE to </html>).
+2. **FIND**: Locate placeholder text in the template (like "{{name}}", "John Doe", "Software Engineer", "2020-2023", etc.).
+3. **REPLACE**: Replace ONLY that placeholder text with the user's corresponding information.
+
+**WHAT TO REPLACE:**
+- Names, contact info
+- Job titles, companies, dates, descriptions
+- Education details
+- Skills lists
+
+**WHAT TO NEVER CHANGE:**
+- HTML tags (div, p, h1, etc.), CSS classes, IDs, or any inline styles.
+- The overall HTML structure, layout, nesting, alignment, spacing, colors, and fonts.
+
+**FOR EXTRA USER CONTENT:**
+If the user's resume has sections not present in the template (e.g., 'Projects', 'Certifications'):
+- Find a similar section in the template (e.g., 'Experience').
+- Copy that section's HTML structure.
+- Add it at a logical place (usually at the end) with the user's content.
+- Reuse the same CSS classes and styling patterns to maintain consistency.
+
+**CRITICAL:** Ensure ALL information from the user's resume is included in the final HTML. Do not omit any details.
+"""
+
+        return f"""You are an expert HTML resume generator. Your task is to take the user's resume content and perfectly merge it into the provided company HTML template by acting as a precise find-and-replace tool.
+
+**User's Resume Content:**
+---
+{resume_text}
+---
+
+**Company HTML Template:**
+---
+{template_html}
+---
+
+{instructions}
+
+Now, generate the final, complete HTML file. Your entire output must be only the HTML code, starting with `<!DOCTYPE html>` and ending with `</html>`. Do not include any explanations or surrounding text."""
+
+
+# Singleton instance, created when this module is imported.
+# AIService.__init__ raises ValueError if settings.GEMINI_API_KEY is empty,
+# so importing this module fails in that case.
+ai_service = AIService()
--- a/backend/app/services/r2_service.py
+++ b/backend/app/services/r2_service.py
@@ -0,0 +1,203 @@
+"""
+Cloudflare R2 Storage Service
+Handles all interactions with R2 bucket for templates and converted resumes
+"""
+import boto3
+from botocore.client import Config
+from typing import List, Dict, Optional
+from datetime import datetime
+import io
+import socket
+from app.core.config import settings
+
+# Force IPv4 to avoid Docker IPv6 issues.
+# NOTE: this replaces socket.getaddrinfo at import time, so every caller of
+# socket.getaddrinfo in the process (not only the R2 client) resolves IPv4
+# addresses only once this module has been imported.
+original_getaddrinfo = socket.getaddrinfo
+
+def getaddrinfo_ipv4_only(host, port, family=0, type=0, proto=0, flags=0):
+    """Resolve ``host``/``port`` through the original resolver, IPv4 only.
+
+    The ``family`` argument is accepted but ignored: ``socket.AF_INET`` is
+    always passed to the original ``getaddrinfo`` instead.
+    """
+    return original_getaddrinfo(host, port, socket.AF_INET, type, proto, flags)
+
+socket.getaddrinfo = getaddrinfo_ipv4_only
+
+
+class R2Service:
+    """Service for interacting with Cloudflare R2 storage"""
+
+    def __init__(self):
+        """Initialize R2 client with credentials from settings"""
+        self.s3_client = boto3.client(
+            's3',
+            endpoint_url=settings.R2_ENDPOINT,
+            aws_access_key_id=settings.R2_ACCESS_KEY_ID,
+            aws_secret_access_key=settings.R2_SECRET_ACCESS_KEY,
+            config=Config(
+                signature_version='s3v4',
+                s3={'addressing_style': 'path'}
+            ),
+            region_name='auto'
+        )
+        self.bucket_name = settings.R2_BUCKET_NAME
+        # Key prefixes under which this service reads and writes objects.
+        self.templates_prefix = "templates/"
+        self.converted_prefix = "converted_resumes/"
+
+    def _iter_objects(self, prefix: str):
+        """Yield every object entry under ``prefix``, following pagination."""
+        paginator = self.s3_client.get_paginator('list_objects_v2')
+        for page in paginator.paginate(Bucket=self.bucket_name, Prefix=prefix):
+            # Pages with no matching objects have no 'Contents' key.
+            yield from page.get('Contents', [])
+
+    def list_templates(self) -> List[str]:
+        """
+        List all available template names from R2
+        Returns: List of template names (without the templates prefix and
+            .html extension); an empty list if the listing fails
+        """
+        try:
+            print(f"Attempting to list templates from bucket: {self.bucket_name}, prefix: {self.templates_prefix}")
+            print(f"Using endpoint: {settings.R2_ENDPOINT}")
+
+            suffix = '.html'
+            templates = []
+            for obj in self._iter_objects(self.templates_prefix):
+                key = obj['Key']
+                if not key.endswith(suffix):
+                    continue
+                # Strip the extension from the end and the prefix from the
+                # start only; str.replace would also remove occurrences in
+                # the middle of the name.
+                template_name = key[:-len(suffix)]
+                if template_name.startswith(self.templates_prefix):
+                    template_name = template_name[len(self.templates_prefix):]
+                if template_name:  # Skip if empty (i.e., key was just the prefix)
+                    templates.append(template_name)
+
+            if not templates:
+                print(f"No contents found in bucket with prefix {self.templates_prefix}")
+            print(f"Found templates: {templates}")
+            return templates
+        except Exception as e:
+            # Best-effort: report the failure and fall back to an empty list.
+            print(f"Error listing templates: {e}")
+            import traceback
+            traceback.print_exc()
+            return []
+
+    def get_template_content(self, template_name: str) -> Optional[str]:
+        """
+        Get the HTML content of a specific template
+        Args:
+            template_name: Name of the template (without .html extension)
+        Returns: HTML content as string, or None if the object could not be
+            fetched or decoded
+        """
+        try:
+            key = f"{self.templates_prefix}{template_name}.html"
+            response = self.s3_client.get_object(
+                Bucket=self.bucket_name,
+                Key=key
+            )
+            content = response['Body'].read().decode('utf-8')
+            return content
+        except Exception as e:
+            print(f"Error getting template content for {template_name}: {e}")
+            return None
+
+    def upload_converted_file(
+        self,
+        file_content: bytes,
+        filename: str,
+        content_type: str,
+        metadata: Optional[Dict[str, str]] = None
+    ) -> Optional[str]:
+        """
+        Upload a converted resume file to R2
+        Args:
+            file_content: File content as bytes
+            filename: Name of the file
+            content_type: MIME type (text/html or application/pdf)
+            metadata: Optional metadata dict
+        Returns: URL of the uploaded object built from the configured
+            endpoint, bucket and key, or None if the upload failed
+        """
+        try:
+            # The key is prefixed with a second-resolution local timestamp.
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            key = f"{self.converted_prefix}{timestamp}_{filename}"
+
+            upload_args = {
+                'Bucket': self.bucket_name,
+                'Key': key,
+                'Body': file_content,
+                'ContentType': content_type
+            }
+
+            if metadata:
+                upload_args['Metadata'] = metadata
+
+            self.s3_client.put_object(**upload_args)
+
+            # Build the object URL from the endpoint, bucket and key.
+            # NOTE(review): whether this URL is publicly reachable depends on
+            # the bucket's configuration - confirm.
+            url = f"{settings.R2_ENDPOINT}/{self.bucket_name}/{key}"
+            return url
+        except Exception as e:
+            print(f"Error uploading file {filename}: {e}")
+            return None
+
+    def list_converted_resumes(self, limit: int = 50) -> List[Dict]:
+        """
+        List converted resumes from R2, newest first
+        Args:
+            limit: Maximum number of files to return
+        Returns: List of dicts with file metadata (id, name, url, size,
+            lastModified, timestamp); an empty list if the listing fails
+        """
+        if limit <= 0:
+            return []
+
+        try:
+            files = []
+            # Walk the whole prefix before truncating: listings come back in
+            # key order, so capping the request itself would keep the first
+            # keys rather than the most recently modified files.
+            for obj in self._iter_objects(self.converted_prefix):
+                key = obj['Key']
+                # Skip directory markers
+                if key.endswith('/'):
+                    continue
+
+                if key.startswith(self.converted_prefix):
+                    filename = key[len(self.converted_prefix):]
+                else:
+                    filename = key
+                file_url = f"{settings.R2_ENDPOINT}/{self.bucket_name}/{key}"
+
+                files.append({
+                    'id': key,
+                    'name': filename,
+                    'url': file_url,
+                    'size': obj['Size'],
+                    'lastModified': obj['LastModified'].isoformat(),
+                    'timestamp': obj['LastModified']
+                })
+
+            # Sort by timestamp, newest first, then apply the limit
+            files.sort(key=lambda x: x['timestamp'], reverse=True)
+
+            return files[:limit]
+        except Exception as e:
+            print(f"Error listing converted resumes: {e}")
+            return []
+
+    def get_file_url(self, key: str, expires_in: int = 3600) -> Optional[str]:
+        """
+        Generate a presigned URL for a file
+        Args:
+            key: Object key in R2
+            expires_in: URL expiration time in seconds (default 1 hour)
+        Returns: Presigned URL or None if failed
+        """
+        try:
+            url = self.s3_client.generate_presigned_url(
+                'get_object',
+                Params={'Bucket': self.bucket_name, 'Key': key},
+                ExpiresIn=expires_in
+            )
+            return url
+        except Exception as e:
+            print(f"Error generating presigned URL for {key}: {e}")
+            return None
+
+
+# Singleton instance, created when this module is imported; R2Service.__init__
+# builds the boto3 client from the R2_* values in settings at that point.
+r2_service = R2Service()