import asyncio
import json
import logging
import os
import re
import time
import unicodedata
import warnings
from datetime import datetime, timedelta
from pathlib import Path
from typing import List, Optional, Union

import numpy as np
import pandas as pd
import pymysql
import spacy
from decouple import AutoConfig, config
from fastapi import BackgroundTasks, Body, FastAPI, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from langchain_openai import OpenAIEmbeddings
from openai import AsyncOpenAI
from pydantic import BaseModel
from pymilvus import (
    AnnSearchRequest,
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    RRFRanker,
    connections,
    db,
    utility,
)
from rapidfuzz import fuzz

from prompt_template import (
    base_prompt,
    base_batch_prompt,
    matched_score_llm,
    translate_audio_file,
    prompt_analysis,
    safe_parse_json,
    generate_interview_questions,
    get_groq_llm,
    safe_parse_batch_json,
)
from score_algo import extract_resume_text, compare_resume_with_jd_text

# Conditional import for legacy Faiss functions
try:
    import faiss
    FAISS_AVAILABLE = True
except ImportError:
    FAISS_AVAILABLE = False
    print("[INFO] Faiss not available - using Milvus only")




BASE_DIR = Path(__file__).resolve().parent
UPLOAD_DIR = BASE_DIR / "uploaded_videos"
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
DEEPGRAM_API_KEY = config("DEEPGRAM_API_KEY")

# Load SpaCy model once
# NLP = spacy.load("en_core_web_lg")

config = AutoConfig()


warnings.simplefilter("ignore")

# Load configuration
API_HOST = config('API_HOST')
API_PORT = config('API_PORT')
HOST = config('HOST_1')
PORT = config('PORT')
USER = config('USER_1')
DB = config('DB')
PASSWORD = config('PASSWARD')
OPENAI_API_KEY = config('OPENAI_API_KEY')
GEMINI_API_KEY = config('GEMINI_API_KEY')
LLM_MODEL = config('LLM_MODEL')
MILVUS_DB_NAME = config('MILVUS_DB_NAME')
# Milvus connection settings referenced by the sync/search helpers below
# (config key names are assumed; adjust to match the .env file)
MILVUS_HOST = config('MILVUS_HOST')
MILVUS_PORT = config('MILVUS_PORT')

CANDIDATE_RECOMMANDATION_COLLECTION_NAME = config('CANDIDATE_RECOMMANDATION_COLLECTION_NAME')
JOB_RECOMMANDATION_COLLECTION_NAME = config('JOB_RECOMMANDATION_COLLECTION_NAME')

Groq_API_KEY = config('Groq_API_KEY')

CANDIDATE_SCORE_BATCH_SIZE = config("CANDIDATE_SCORE_BATCH_SIZE")

print("@@@",API_HOST,API_PORT)
# # Initialize FastAPI
app = FastAPI()



# Set up database connection
def get_db_connection():
    return pymysql.connect(
        host=HOST,
        port=3306,
        user=USER, 
        db=DB, 
        passwd=PASSWORD,
        autocommit=True
    )
conne = get_db_connection()
dvc = pd.read_sql_query('''SELECT candidate_id, parsed_data FROM candidate_details''', conne)

# Initialize embeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002", api_key=OPENAI_API_KEY)

# Directory for on-disk Faiss indexes and ID-map files
output_dir = f"{os.getcwd()}/vector_DB"
os.makedirs(output_dir, exist_ok=True)

# Global variables to track current vector store
current_index_path = None
current_mapping_path = None
candidate_id_map = {}
job_id_map = {}
last_update_time = 0

class CustomJSONOutputParser:
    def parse(self, text: str) -> dict:
        try:
            # Step 1: Remove extra outer quotes if present
            if text.startswith('"') and text.endswith('"'):
                text = text[1:-1]

            # Step 2: Replace escape sequences
            text = text.encode('utf-8').decode('unicode_escape')
            # Step 3: Locate JSON object boundaries
            json_start, json_end = text.find('{'), text.rfind('}') + 1
            if json_start == -1 or json_end == 0:  # rfind('}') == -1 means json_end == 0
                raise ValueError("No valid JSON structure found in string.")
            
            json_str = text[json_start:json_end]

            # Step 4: Load into dict
            return json.loads(json_str)

        except (json.JSONDecodeError, UnicodeDecodeError, ValueError) as e:
            raise ValueError(f"[ParserError] Failed to parse JSON: {e}")


# Instantiate parser once for performance and clarity
parser = CustomJSONOutputParser()
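
# Illustrative input/output for the parser (a sketch; the doubly quoted,
# escape-laden form below is an assumption about how parsed_data is stored):
#   raw = '"{\\"summary\\": \\"Backend engineer\\", \\"skills\\": [\\"python\\"]}"'
#   parser.parse(raw)  # -> {'summary': 'Backend engineer', 'skills': ['python']}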

def clean_list(input_list):
    parsed_list = []
    for item in input_list:
        if item is None or (isinstance(item, str) and item.strip().upper() in ['NA', 'N/A']):
            parsed_list.append("none")
        else:
            try:
                parsed_item = parser.parse(item)
                parsed_list.append(parsed_item)
            except ValueError as e:
                # Parsing failed; keep the raw item as a fallback
                print(f"[WARN] Could not parse JSON, keeping raw item: {e}")
                parsed_list.append(item)
    return parsed_list



KEYS_OF_INTEREST = ["summary","languages","sectors","keywords","experience", "skills", "projects", "certifications", "education","additional_security_analysis"]

def extract_relevant_fields(parsed_json: dict) -> dict:
    """
    Filters out only the keys of interest from a parsed JSON dict.
    """
    extracted = {}
    for key in KEYS_OF_INTEREST:
        value = parsed_json.get(key, None)
        if value is None:
            print(f"[INFO] Key '{key}' not found in parsed record — assigning None")
        extracted[key] = value
    return extracted


def get_latest_files():
    global current_index_path, current_mapping_path
    
    vector_files = [f for f in os.listdir(output_dir) if f.startswith("faiss_index_candidates_")]
    mapping_files = [f for f in os.listdir(output_dir) if f.startswith("candidate_id_map_")]
    
    if vector_files and mapping_files:
        latest_vector = sorted(vector_files)[-1]
        latest_mapping = sorted(mapping_files)[-1]
        current_index_path = os.path.join(output_dir, latest_vector)
        current_mapping_path = os.path.join(output_dir, latest_mapping)
        return True
    return False

def load_candidate_mapping():
    global candidate_id_map
    
    if current_mapping_path and os.path.exists(current_mapping_path):
        with open(current_mapping_path, 'r') as f:
            candidate_id_map = json.load(f)
        return True
    return False
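
# candidate_id_map maps Faiss row positions (stored as string keys) to candidate
# IDs, e.g. {"0": "101", "1": "245"}: a similarity search returns row positions,
# which are translated back to candidate IDs through this dict.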

def get_cleaned_candidate_text(candidate_id):
    conn = get_db_connection()
    try:
        query = """
        SELECT 
            u.headline,
            u.about,
            GROUP_CONCAT(DISTINCT s.sector_name) AS sectors,
            GROUP_CONCAT(DISTINCT k.title) AS keywords,
            GROUP_CONCAT(DISTINCT sk.title) AS skills,
            GROUP_CONCAT(DISTINCT l.title) AS languages,
            GROUP_CONCAT(DISTINCT CONCAT_WS('||', up.title, up.description, up.technologies) SEPARATOR '%%') AS projects,
            GROUP_CONCAT(DISTINCT CONCAT_WS('||', uc.name, uc.authority) SEPARATOR '%%') AS certifications
        FROM 
            candidate_details cd
        LEFT JOIN users u ON cd.candidate_id = u.id
        LEFT JOIN user_sectors us ON cd.candidate_id = us.user_id
        LEFT JOIN sectors s ON us.sector_id = s.id
        LEFT JOIN user_keywords uk ON cd.candidate_id = uk.user_id
        LEFT JOIN keywords k ON uk.keyword_id = k.id
        LEFT JOIN user_skills usk ON cd.candidate_id = usk.user_id
        LEFT JOIN skills sk ON usk.skill_id = sk.id
        LEFT JOIN user_languages ul ON cd.candidate_id = ul.user_id
        LEFT JOIN languages l ON ul.language_id = l.id
        LEFT JOIN user_projects up ON cd.candidate_id = up.user_id
        LEFT JOIN user_certifications uc ON cd.candidate_id = uc.user_id
        WHERE 
            cd.candidate_id = %s
        GROUP BY 
            u.headline, u.about
        """

        df = pd.read_sql_query(query, conn, params=[candidate_id])
        if df.empty:
            raise HTTPException(status_code=404, detail=f"Candidate ID {candidate_id} not found in DB.")
        row = df.iloc[0]
        candidate_text_parts = [str(row[col]) for col in df.columns if pd.notnull(row[col])]
        candidate_text = " ".join(candidate_text_parts)
        
        cleaned_candidate_text = clean_resume_text(candidate_text)
        return cleaned_candidate_text
        
    finally:
        conn.close()

# def create_initial_vectorstore():
#     global current_index_path, current_mapping_path, candidate_id_map, last_update_time
    
#     conn = get_db_connection()
    
#     try:
#         # Fetch all candidate data
#         dvc = pd.read_sql_query('''SELECT candidate_id, parsed_data FROM candidate_details''', conn)
#         list_candidate_ids = dvc['candidate_id'].to_list()
#         list_parsed_data = dvc['parsed_data'].to_list()
#         print(f"Creating initial vector store with {len(list_candidate_ids)} candidates")
        
#         # Clean data
#         cleaned_parsed_data = clean_list(list_parsed_data)
#         final_candidate_list = []
#         for idx, resume_json in enumerate(cleaned_parsed_data):
#             if isinstance(resume_json, dict):
#                 extracted = extract_relevant_fields(resume_json)
#                 final_candidate_list.append(str(extracted))
#             else:
#                 print(f"[WARN] Resume at index {idx} skipped due to invalid data format.")

#         print("candidate_data_len",len(final_candidate_list))
#         # Create mapping
#         candidate_id_map = {str(i): str(cid) for i, cid in enumerate(list_candidate_ids)}
        
#         # Generate embeddings
#         candidate_embeddings = embeddings.embed_documents(final_candidate_list)
#         embedding_matrix = np.array(candidate_embeddings, dtype=np.float32)
#         dimension = embedding_matrix.shape[1]
        
#         # Create FAISS index
#         index = faiss.IndexFlatIP(dimension)
#         index.add(embedding_matrix)
        
#         # Save everything
#         timestr = time.strftime("%Y%m%d-%H%M%S")
#         current_index_path = os.path.join(output_dir, f"faiss_index_candidates_{timestr}")
#         current_mapping_path = os.path.join(output_dir, f"candidate_id_map_{timestr}.json")
        
#         faiss.write_index(index, current_index_path)
#         with open(current_mapping_path, 'w') as f:
#             json.dump(candidate_id_map, f)
            
#         last_update_time = time.time()
        
#         print(f"Initial vector store created: {current_index_path}")
#         return index
#     finally:
#         conn.close()

def create_initial_vectorstore():
    global current_index_path, current_mapping_path, candidate_id_map, last_update_time

    conn = get_db_connection()

    try:
        # Fetch all candidate IDs
        df_ids = pd.read_sql_query('''SELECT DISTINCT candidate_id FROM candidate_details''', conn)
        all_candidate_ids = df_ids['candidate_id'].tolist()
        print(f"Creating initial vector store with {len(all_candidate_ids)} candidates")

        list_candidate_ids = []
        final_candidate_texts = []

        # Process each candidate one by one
        for candidate_id in all_candidate_ids:
            try:
                cleaned_text = get_cleaned_candidate_text(candidate_id)
                final_candidate_texts.append(cleaned_text)
                list_candidate_ids.append(str(candidate_id))
            except HTTPException as e:
                print(f"[WARN] Candidate ID {candidate_id} skipped: {e.detail}")
            except Exception as e:
                print(f"[ERROR] Unexpected error with Candidate ID {candidate_id}: {e}")

        if not final_candidate_texts:
            print("[ERROR] No valid candidate data found for initial vector store.")
            return None

        print("candidate_data_len", len(final_candidate_texts))

        # Create ID mapping
        candidate_id_map = {str(i): str(cid) for i, cid in enumerate(list_candidate_ids)}

        # Generate embeddings
        candidate_embeddings = embeddings.embed_documents(final_candidate_texts)
        embedding_matrix = np.array(candidate_embeddings, dtype=np.float32)
        dimension = embedding_matrix.shape[1]

        # Build FAISS index
        index = faiss.IndexFlatIP(dimension)
        index.add(embedding_matrix)

        # Save index and map
        timestr = time.strftime("%Y%m%d-%H%M%S")
        current_index_path = os.path.join(output_dir, f"faiss_index_candidates_{timestr}")
        current_mapping_path = os.path.join(output_dir, f"candidate_id_map_{timestr}.json")

        faiss.write_index(index, current_index_path)
        with open(current_mapping_path, 'w') as f:
            json.dump(candidate_id_map, f)

        last_update_time = time.time()
        print(f"[SUCCESS] Initial vector store created: {current_index_path}")
        return index

    finally:
        conn.close()

def get_vectorstore():
    global current_index_path
    
    # Try to find existing files
    if not get_latest_files():
        # Create new vector store if none exists
        return create_initial_vectorstore()
    
    # Load mapping
    if not load_candidate_mapping():
        return create_initial_vectorstore()
    
    # Load the FAISS index
    try:
        index = faiss.read_index(current_index_path)
        return index
    except Exception as e:
        print(f"Error loading vector store: {e}")
        return create_initial_vectorstore()

def get_existing_candidate_ids_from_milvus(database_name="hirenest_recommandation", collection_name="candidate_embeddings"):
    """
    Get existing candidate IDs from Milvus collection
    
    Returns:
        set: Set of existing candidate IDs in Milvus
    """
    try:
        # Connect to Milvus
        connections.connect(
            alias="default",
            host=MILVUS_HOST,
            port=MILVUS_PORT
        )
        db.using_database(database_name)
        
        # Check if collection exists
        if not utility.has_collection(collection_name):
            print(f"[INFO] Collection '{collection_name}' does not exist yet")
            return set()
        
        # Get collection
        collection = Collection(collection_name)
        
        # Load collection to query it
        try:
            collection.load()
        except Exception as load_error:
            print(f"[WARN] Could not load collection for querying: {load_error}")
            return set()
        
        # Query to get all candidate IDs (with pagination to handle large datasets)
        try:
            all_candidate_ids = set()
            offset = 0
            limit = 16000  # Stay under Milvus limit of 16384
            
            while True:
                # Use collection.query to get candidate_id values with pagination
                results = collection.query(
                    expr="candidate_id > 0",  # Get all records where candidate_id is positive
                    output_fields=["candidate_id"],
                    limit=limit,
                    offset=offset
                )
                
                if not results:
                    break
                    
                # Add candidate IDs to the set
                batch_candidate_ids = set(str(result["candidate_id"]) for result in results)
                all_candidate_ids.update(batch_candidate_ids)
                
                # If we got fewer results than the limit, we've reached the end
                if len(results) < limit:
                    break
                    
                offset += limit
            
            print(f"[INFO] Found {len(all_candidate_ids)} existing candidates in Milvus")
            return all_candidate_ids
            
        except Exception as query_error:
            print(f"[ERROR] Failed to query existing candidate IDs from Milvus: {query_error}")
            return set()
        
    except Exception as e:
        print(f"[ERROR] Failed to connect to Milvus for candidate ID check: {e}")
        return set()

def get_new_candidates():
    conn = get_db_connection()
    
    try:
        # Get existing candidate IDs from Milvus (not from Faiss candidate_id_map)
        existing_candidate_ids = get_existing_candidate_ids_from_milvus()
        
        # Fetch all candidate IDs from the database
        query = "SELECT candidate_id FROM candidate_details"
        df = pd.read_sql_query(query, conn)
        all_candidate_ids = set(str(cid) for cid in df['candidate_id'].tolist())
        
        # Find new candidate IDs
        new_candidate_ids = all_candidate_ids - existing_candidate_ids
        
        print(f"[INFO] Total candidates in DB: {len(all_candidate_ids)}, Existing in Milvus: {len(existing_candidate_ids)}, New: {len(new_candidate_ids)}")
        
        if not new_candidate_ids:
            return []
        
        # Fetch data for new candidates
        placeholders = ', '.join(['%s'] * len(new_candidate_ids))
        query = f"SELECT candidate_id, parsed_data FROM candidate_details WHERE candidate_id IN ({placeholders})"
        df = pd.read_sql_query(query, conn, params=list(new_candidate_ids))
        
        return df.to_dict('records')
    finally:
        conn.close()

# def update_vectorstore(new_candidates):
#     global current_index_path, current_mapping_path, candidate_id_map, last_update_time
    
#     if not new_candidates:
#         return False
    
#     # Load existing index
#     index = faiss.read_index(current_index_path)
#     print("new candidates data:",new_candidates)
#     # Prepare new data
#     list_candidate_ids = [str(item['candidate_id']) for item in new_candidates]
#     list_parsed_data = [item['parsed_data'] for item in new_candidates]

#     print("parsed_data:",list_parsed_data)
#     cleaned_parsed_data = clean_list(list_parsed_data)
#     print("cleaned_parsed_data:",cleaned_parsed_data)
#     final_candidate_list = []
#     for idx, resume_json in enumerate(cleaned_parsed_data):
#         if isinstance(resume_json, dict):
#             extracted = extract_relevant_fields(resume_json)
#             final_candidate_list.append(str(extracted))
#         else:
#             final_candidate_list.append(resume_json)
#             print(f"[WARN] Resume at index {idx} skipped due to invalid data format.")

#     print("candidate_data_len",len(final_candidate_list),final_candidate_list )
    
#     # Generate embeddings for new data
#     new_embeddings = embeddings.embed_documents(final_candidate_list)
#     new_embedding_matrix = np.array(new_embeddings, dtype=np.float32)
    
#     # Add to the index
#     index.add(new_embedding_matrix)
    
#     # Update the mapping
#     start_idx = len(candidate_id_map)
#     for i, cid in enumerate(list_candidate_ids):
#         candidate_id_map[str(start_idx + i)] = cid
    
#     # Save everything with a new timestamp
#     timestr = time.strftime("%Y%m%d-%H%M%S")
#     new_index_path = os.path.join(output_dir, f"faiss_index_candidates_{timestr}")
#     new_mapping_path = os.path.join(output_dir, f"candidate_id_map_{timestr}.json")
    
#     faiss.write_index(index, new_index_path)
#     with open(new_mapping_path, 'w') as f:
#         json.dump(candidate_id_map, f)
    
#     # Update global variables
#     current_index_path = new_index_path
#     current_mapping_path = new_mapping_path
#     last_update_time = time.time()
    
#     print(f"Vector store updated with {len(new_candidates)} new candidates")
#     return True

def update_vectorstore(new_candidates):
    """
    Legacy Faiss function - requires faiss to be installed.
    Expects a list of candidate dicts as returned by get_new_candidates().
    """
    global current_index_path, current_mapping_path, candidate_id_map, last_update_time

    if not FAISS_AVAILABLE:
        print("[ERROR] Faiss not available. Use Milvus functions instead.")
        return False

    if not new_candidates:
        return False

    # Load existing FAISS index
    index = faiss.read_index(current_index_path)
    print("index.ntotal:",index.ntotal)

    list_candidate_ids = []
    final_candidate_texts = []
    print("new_candidate_ids:",[candidate_id for candidate_id in new_candidate_ids])
    # breakpoint()
    for candidate in new_candidate_ids:
        
        try:
            # Fetch and clean candidate text
            cleaned_text = get_cleaned_candidate_text(candidate['candidate_id'])
            print("cleaned_text:",cleaned_text)
            
            final_candidate_texts.append(cleaned_text)
            list_candidate_ids.append(str(candidate['candidate_id']))
        except HTTPException as e:
            print(f"[WARN] Candidate ID {candidate['candidate_id']} skipped: {e.detail}")
        except Exception as e:
            print(f"[ERROR] Unexpected error with Candidate ID {candidate['candidate_id']}: {e}")

    if not final_candidate_texts:
        print("[INFO] No valid candidate data to update the vector store.")
        return False

    print("candidate_data_len", len(final_candidate_texts), final_candidate_texts)

    # Generate embeddings for the cleaned candidate texts
    new_embeddings = embeddings.embed_documents(final_candidate_texts)
    new_embedding_matrix = np.array(new_embeddings, dtype=np.float32)

    # Add new embeddings to FAISS index
    index.add(new_embedding_matrix)

    # Update candidate_id mapping
    start_idx = len(candidate_id_map)
    for i, cid in enumerate(list_candidate_ids):
        candidate_id_map[str(start_idx + i)] = cid

    # Save updated index and map
    timestr = time.strftime("%Y%m%d-%H%M%S")
    new_index_path = os.path.join(output_dir, f"faiss_index_candidates_{timestr}")
    new_mapping_path = os.path.join(output_dir, f"candidate_id_map_{timestr}.json")

    faiss.write_index(index, new_index_path)
    with open(new_mapping_path, 'w') as f:
        json.dump(candidate_id_map, f)

    # Update global references
    current_index_path = new_index_path
    current_mapping_path = new_mapping_path
    last_update_time = time.time()

    print(f"[SUCCESS] Vector store updated with {len(list_candidate_ids)} new candidates")
    return True
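
# Minimal sketch of how the legacy Faiss index and candidate_id_map fit together
# at query time (illustrative only; the variable names below are assumptions):
#
#   query_vec = np.array([embeddings.embed_query(job_description)], dtype=np.float32)
#   scores, positions = index.search(query_vec, 10)   # inner-product similarity
#   matched_ids = [candidate_id_map[str(p)] for p in positions[0] if p != -1]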


def get_job_description_by_id(job_id: int) -> str:
    """
    Retrieves and merges the job title and description for a given job ID.
    
    Parameters:
        job_id (int): The ID of the job to retrieve.

    Returns:
        str: A merged string of the job title and description.

    Raises:
        HTTPException: If the job ID does not exist in the database.
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT title, description FROM assignments WHERE id = %s", (job_id,))
        result = cursor.fetchone()

        if not result:
            raise HTTPException(status_code=404, detail=f"Job ID {job_id} not found")

        title, description = result
        return f"{title.strip()} - {description.strip()}" if title and description else title or description

    finally:
        conn.close()



def get_job_description_by_id_match(job_id: int) -> str:
    """
    Retrieves and merges the job title, description, skills, keywords, languages, and sectors for a given job ID.
    
    Parameters:
        job_id (int): The ID of the job to retrieve.

    Returns:
        str: A merged string containing the job details.

    Raises:
        HTTPException: If the job ID does not exist in the database.
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute('''SELECT 
            a.title AS title,
            a.text_description AS description,
            GROUP_CONCAT(DISTINCT s.title) AS skill,
            GROUP_CONCAT(DISTINCT k.title) AS keyword,  
            GROUP_CONCAT(DISTINCT l.title) AS language,
            GROUP_CONCAT(DISTINCT sec.sector_name) AS sector            
        FROM 
            assignments AS a
        LEFT JOIN 
            assignment_skills AS ask ON a.id = ask.assignment_id
        LEFT JOIN 
            skills AS s ON ask.skill_id = s.id
        LEFT JOIN 
            assignment_keywords AS ak ON a.id = ak.assignment_id
        LEFT JOIN 
            keywords AS k ON ak.keyword_id = k.id
        LEFT JOIN 
            assignment_languages AS al ON a.id = al.assignment_id
        LEFT JOIN 
            languages AS l ON al.language_id = l.id
        LEFT JOIN 
            assignment_sectors AS asec ON a.id = asec.assignment_id
        LEFT JOIN 
            sectors AS sec ON asec.sector_id = sec.id
        WHERE 
            a.id = %s
        GROUP BY 
            a.id, a.title, a.text_description''', (job_id,))
        
        result = cursor.fetchone()

        if not result:
            raise HTTPException(status_code=404, detail=f"Job ID {job_id} not found")

        title = result[0] if result[0] else ""
        description = result[1] if result[1] else ""
        skill = result[2] if result[2] else ""
        keyword = result[3] if result[3] else ""
        language = result[4] if result[4] else ""
        sector = result[5] if result[5] else ""
        
        return f"{title.strip()} - {description.strip()} - Skills:{skill.strip()} - {language.strip()} - {keyword.strip()} - {sector.strip()}"

    finally:
        conn.close()

def get_candidate_details_structured(candidate_id: int):
    """
    Retrieve structured candidate details for hybrid search.
    
    Args:
        candidate_id: Candidate ID
        
    Returns:
        dict: Dictionary with individual fields and combined text
    """
    conn = get_db_connection()
    try:
        query = """
        SELECT 
            GROUP_CONCAT(DISTINCT sk.title) AS skills,
            u.headline,
            GROUP_CONCAT(DISTINCT l.title) AS languages,
            GROUP_CONCAT(DISTINCT s.sector_name) AS sectors,
            u.about,
            GROUP_CONCAT(DISTINCT k.title) AS keywords,
            GROUP_CONCAT(DISTINCT CONCAT_WS('||', up.title, up.description, up.technologies) SEPARATOR '%%') AS projects,
            GROUP_CONCAT(DISTINCT CONCAT_WS('||', uc.name, uc.authority) SEPARATOR '%%') AS certifications
        FROM 
            candidate_details cd
        LEFT JOIN users u ON cd.candidate_id = u.id
        LEFT JOIN user_sectors us ON cd.candidate_id = us.user_id
        LEFT JOIN sectors s ON us.sector_id = s.id
        LEFT JOIN user_keywords uk ON cd.candidate_id = uk.user_id
        LEFT JOIN keywords k ON uk.keyword_id = k.id
        LEFT JOIN user_skills usk ON cd.candidate_id = usk.user_id
        LEFT JOIN skills sk ON usk.skill_id = sk.id
        LEFT JOIN user_languages ul ON cd.candidate_id = ul.user_id
        LEFT JOIN languages l ON ul.language_id = l.id
        LEFT JOIN user_projects up ON cd.candidate_id = up.user_id
        LEFT JOIN user_certifications uc ON cd.candidate_id = uc.user_id
        WHERE 
            cd.candidate_id = %s
        GROUP BY 
            u.headline, u.about
        """
        
        df = pd.read_sql_query(query, conn, params=[candidate_id])
        
        if df.empty:
            return {
                "headline": "",
                "about": "",
                "skills": "",
                "sectors": "",
                "keywords": "",
                "languages": "",
                "projects": "",
                "certifications": "",
                "combined_text": ""
            }
        
        row = df.iloc[0]
        
        # Clean and prepare fields
        skills = (row['skills'] or "").strip()
        sectors = (row['sectors'] or "").strip()
        keywords = (row['keywords'] or "").strip()
        languages = (row['languages'] or "").strip()
        headline = (row['headline'] or "").strip()
        about = (row['about'] or "").strip()
        projects = (row['projects'] or "").strip()
        certifications = (row['certifications'] or "").strip()
        
        # Create combined text
        combined_parts = []
        if headline:
            combined_parts.append(headline)
        if about:
            combined_parts.append(about)
        if skills:
            combined_parts.append(skills)
        if sectors:
            combined_parts.append(sectors)
        if keywords:
            combined_parts.append(keywords)
        if languages:
            combined_parts.append(languages)
        if projects:
            combined_parts.append(projects)
        if certifications:
            combined_parts.append(certifications)
        
        combined_text = " ".join(combined_parts)
        
        return {
            "headline": headline,
            "about": about,
            "skills": skills,
            "sectors": sectors,
            "keywords": keywords,
            "languages": languages,
            "projects": projects,
            "certifications": certifications,
            "combined_text": combined_text
        }
        
    finally:
        conn.close()

def get_job_details_structured(job_id: int) -> dict:
    """
    Retrieves structured job details for field-specific hybrid search.
    
    Parameters:
        job_id (int): The ID of the job to retrieve.

    Returns:
        dict: Structured job details with separate fields.

    Raises:
        HTTPException: If the job ID does not exist in the database.
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute('''SELECT 
            a.title AS title,
            a.text_description AS description,
            GROUP_CONCAT(DISTINCT s.title) AS skills,
            GROUP_CONCAT(DISTINCT k.title) AS keywords,  
            GROUP_CONCAT(DISTINCT l.title) AS languages,
            GROUP_CONCAT(DISTINCT sec.sector_name) AS sectors            
        FROM 
            assignments AS a
        LEFT JOIN 
            assignment_skills AS ask ON a.id = ask.assignment_id
        LEFT JOIN 
            skills AS s ON ask.skill_id = s.id
        LEFT JOIN 
            assignment_keywords AS ak ON a.id = ak.assignment_id
        LEFT JOIN 
            keywords AS k ON ak.keyword_id = k.id
        LEFT JOIN 
            assignment_languages AS al ON a.id = al.assignment_id
        LEFT JOIN 
            languages AS l ON al.language_id = l.id
        LEFT JOIN 
            assignment_sectors AS asec ON a.id = asec.assignment_id
        LEFT JOIN 
            sectors AS sec ON asec.sector_id = sec.id
        WHERE 
            a.id = %s
        GROUP BY 
            a.id, a.title, a.text_description''', (job_id,))
        
        result = cursor.fetchone()

        if not result:
            raise HTTPException(status_code=404, detail=f"Job ID {job_id} not found")

        return {
            "title": result[0] if result[0] else "",
            "description": result[1] if result[1] else "",
            "skills": result[2] if result[2] else "",
            "keywords": result[3] if result[3] else "",
            "languages": result[4] if result[4] else "",
            "sectors": result[5] if result[5] else "",
            "combined_text": f"{result[0] or ''} - {result[1] or ''} - Skills:{result[2] or ''} - {result[4] or ''} - {result[3] or ''} - {result[5] or ''}"
        }

    finally:
        conn.close()
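
# Example of the dict returned above (values illustrative):
#   {"title": "Data Engineer", "description": "Build ETL pipelines...",
#    "skills": "Python,SQL", "keywords": "ETL", "languages": "English",
#    "sectors": "Finance",
#    "combined_text": "Data Engineer - Build ETL pipelines... - Skills:Python,SQL - English - ETL - Finance"}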


def get_candidate_profile_by_id(candidate_id: int) -> str:
    """
    Retrieves the candidate profile data for a given candidate ID.
    
    Parameters:
        candidate_id (int): The ID of the candidate to retrieve.

    Returns:
        str: A string representation of the candidate's profile.

    Raises:
        HTTPException: If the candidate ID does not exist in the database.
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT id, parsed_data FROM candidate_details WHERE candidate_id = %s", (candidate_id,))
        result = cursor.fetchone()
        if not result:
            raise HTTPException(status_code=404, detail=f"Candidate ID {candidate_id} not found")

        parsed_data = result[1]

        # Parse the JSON data; fall back to the raw string if parsing fails
        try:
            if parsed_data:
                parsed_json = parser.parse(parsed_data)
                extracted = extract_relevant_fields(parsed_json)
                return str(extracted)
            return ""
        except ValueError as e:
            print(f"[WARN] Error parsing candidate data for ID {candidate_id}: {e}")
            return parsed_data

    finally:
        conn.close()

def get_candidate_profile_by_id_matched(candidate_id: int) -> Union[dict, str]:
    """
    Retrieves the candidate profile data for a given candidate ID.
    
    Parameters:
        candidate_id (int): The ID of the candidate to retrieve.

    Returns:
        Union[dict, str]: The extracted profile fields, or the raw parsed_data string if parsing fails.

    Raises:
        HTTPException: If the candidate ID does not exist in the database.
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT id, parsed_data FROM candidate_details WHERE candidate_id = %s", (candidate_id,))
        result = cursor.fetchone()
        if not result:
            raise HTTPException(status_code=404, detail=f"Candidate ID {candidate_id} not found")

        parsed_data = result[1]

        # Parse the JSON data; fall back to the raw string if parsing fails
        try:
            if parsed_data:
                parsed_json = parser.parse(parsed_data)
                extracted = extract_relevant_fields(parsed_json)
                return extracted
            return ""
        except ValueError as e:
            print(f"[WARN] Error parsing candidate data for ID {candidate_id}: {e}")
            return parsed_data
            

    finally:
        conn.close()



def create_job_vectorstore():
    """
    Creates a vector store for job descriptions.
    
    Returns:
        faiss.Index: The FAISS index for job descriptions.
    """
    global current_index_path, current_mapping_path, job_id_map, last_update_time
    
    conn = get_db_connection()
    
    try:
        # Fetch all job data
        jobs_df = pd.read_sql_query('''SELECT 
    a.id AS job_id,
    a.title AS title,
    a.text_description AS description,
    GROUP_CONCAT(DISTINCT s.title) AS skill,
    GROUP_CONCAT(DISTINCT k.title) AS keyword,
    GROUP_CONCAT(DISTINCT l.title) AS language,
    GROUP_CONCAT(DISTINCT sec.sector_name) AS sector
FROM 
    assignment_skills AS ask
JOIN 
    assignments AS a ON ask.assignment_id = a.id
JOIN 
    skills AS s ON ask.skill_id = s.id
JOIN 
    assignment_keywords AS ak ON a.id = ak.assignment_id
JOIN 
    keywords AS k ON ak.keyword_id = k.id
JOIN 
    assignment_languages AS al ON a.id = al.assignment_id
JOIN 
    languages AS l ON al.language_id = l.id
JOIN 
    assignment_sectors AS asec ON a.id = asec.assignment_id
JOIN 
    sectors AS sec ON asec.sector_id = sec.id
GROUP BY 
    a.id, a.title, a.text_description''', conn)
        list_job_ids = jobs_df['job_id'].to_list()
        
        # Prepare job descriptions
        job_descriptions = []
        for _, row in jobs_df.iterrows():
            title = row['title'] if row['title'] else ""
            description = row['description'] if row['description'] else ""
            skill = row['skill'] if row['skill'] else ""
            language = row['language'] if row['language'] else ""
            keyword = row['keyword'] if row['keyword'] else ""
            sector = row['sector'] if row['sector'] else ""
            job_descriptions.append(f"{title.strip()} - {description.strip()} - {skill.strip()} - {language.strip()} - {keyword.strip()} - {sector.strip()}")
        print(f"Creating job vector store with {len(list_job_ids)} jobs")
        
        # Create mapping
        job_id_map = {str(i): str(jid) for i, jid in enumerate(list_job_ids)}
        
        # Generate embeddings
        job_embeddings = embeddings.embed_documents(job_descriptions)
        embedding_matrix = np.array(job_embeddings, dtype=np.float32)
        dimension = embedding_matrix.shape[1]
        
        # Create FAISS index
        index = faiss.IndexFlatIP(dimension)
        index.add(embedding_matrix)
        
        # Save everything
        timestr = time.strftime("%Y%m%d-%H%M%S")
        current_index_path = os.path.join(output_dir, f"faiss_index_jobs_{timestr}")
        current_mapping_path = os.path.join(output_dir, f"job_id_map_{timestr}.json")
        
        faiss.write_index(index, current_index_path)
        with open(current_mapping_path, 'w') as f:
            json.dump(job_id_map, f)
            
        last_update_time = time.time()
        
        print(f"Job vector store created: {current_index_path}")
        return index
    finally:
        conn.close()

def get_job_vectorstore():
    """
    Gets or creates the job vector store.
    
    Returns:
        faiss.Index: The FAISS index for job descriptions.
    """
    global current_index_path, current_mapping_path, job_id_map
    # Try to find existing files
    job_vector_files = [f for f in os.listdir(output_dir) if f.startswith("faiss_index_jobs_")]
    job_mapping_files = [f for f in os.listdir(output_dir) if f.startswith("job_id_map_")]
   
    if job_vector_files and job_mapping_files:
        latest_vector = sorted(job_vector_files)[-1]
        latest_mapping = sorted(job_mapping_files)[-1]
        current_index_path = os.path.join(output_dir, latest_vector)
        current_mapping_path = os.path.join(output_dir, latest_mapping)
        
        # Load mapping
        with open(current_mapping_path, 'r') as f:
            job_id_map = json.load(f)
        
        # Load the FAISS index
        try:
            index = faiss.read_index(current_index_path)
            return index
        except Exception as e:
            print(f"Error loading job vector store: {e}")
            return create_job_vectorstore()
    else:
        # Create new vector store if none exists
        return create_job_vectorstore()
    
def get_existing_job_ids_from_milvus(database_name="hirenest_recommandation", collection_name="job_embeddings"):
    """
    Get existing job IDs from Milvus collection
    
    Returns:
        set: Set of existing job IDs in Milvus
    """
    try:
        # Connect to Milvus
        connections.connect(
            alias="default",
            host=MILVUS_HOST,
            port=MILVUS_PORT
        )
        db.using_database(database_name)
        
        # Check if collection exists
        if not utility.has_collection(collection_name):
            print(f"[INFO] Collection '{collection_name}' does not exist yet")
            return set()
        
        # Get collection
        collection = Collection(collection_name)
        
        # Load collection to query it
        try:
            collection.load()
        except Exception as load_error:
            print(f"[WARN] Could not load collection for querying: {load_error}")
            return set()
        
        # Query to get all job IDs (with pagination to handle large datasets)
        try:
            all_job_ids = set()
            offset = 0
            limit = 16000  # Stay under Milvus limit of 16384
            
            while True:
                # Use collection.query to get job_id values with pagination
                results = collection.query(
                    expr="job_id > 0",  # Get all records where job_id is positive
                    output_fields=["job_id"],
                    limit=limit,
                    offset=offset
                )
                
                if not results:
                    break
                    
                # Add job IDs to the set
                batch_job_ids = set(str(result["job_id"]) for result in results)
                all_job_ids.update(batch_job_ids)
                
                # If we got fewer results than the limit, we've reached the end
                if len(results) < limit:
                    break
                    
                offset += limit
            
            print(f"[INFO] Found {len(all_job_ids)} existing jobs in Milvus")
            return all_job_ids
            
        except Exception as query_error:
            print(f"[ERROR] Failed to query existing job IDs from Milvus: {query_error}")
            return set()
        
    except Exception as e:
        print(f"[ERROR] Failed to connect to Milvus for job ID check: {e}")
        return set()

def get_new_jobs():
    conn = get_db_connection()
    
    try:
        # Get existing job IDs from Milvus (not from Faiss job_id_map)
        existing_job_ids = get_existing_job_ids_from_milvus()
        
        # Fetch all job IDs from the database
        query = "SELECT id FROM assignments WHERE status = 'active'"
        df = pd.read_sql_query(query, conn)
        all_job_ids = set(str(jid) for jid in df['id'].tolist())
        
        # Find new job IDs
        new_job_ids = all_job_ids - existing_job_ids
        
        print(f"[INFO] Total jobs in DB: {len(all_job_ids)}, Existing in Milvus: {len(existing_job_ids)}, New: {len(new_job_ids)}")
        
        if not new_job_ids:
            return []
        
        # Fetch data for new jobs
        placeholders = ', '.join(['%s'] * len(new_job_ids))
        query = f"SELECT id, title, text_description as description FROM assignments WHERE id IN ({placeholders}) AND status = 'active'"
        df = pd.read_sql_query(query, conn, params=list(new_job_ids))
        
        return df.to_dict('records')
    finally:
        conn.close()

def update_job_vectorstore(new_jobs):
    """
    Legacy Faiss function - kept for potential fallback but not actively used
    """
    global current_index_path, current_mapping_path, job_id_map, last_update_time
    
    if not new_jobs:
        return False
    
    # Load existing index
    index = faiss.read_index(current_index_path)
    
    # Prepare new data
    list_job_ids = [str(item.get('id', '')) for item in new_jobs]
    job_descriptions = []
    for job in new_jobs:
        # Get values with default empty string and handle None values
        title = job.get('title', '') or ''
        description = job.get('description', '') or ''
        skill = job.get('skill', '') or ''
        language = job.get('language', '') or ''
        keyword = job.get('keyword', '') or ''
        sector = job.get('sector', '') or ''
        
        # Apply strip only after ensuring the value is a string
        job_descriptions.append(f"{title.strip()} - {description.strip()} - {skill.strip()} - {language.strip()} - {keyword.strip()} - {sector.strip()}")
        
    # Generate embeddings for new data
    new_embeddings = embeddings.embed_documents(job_descriptions)
    new_embedding_matrix = np.array(new_embeddings, dtype=np.float32)
    
    # Add to the index
    index.add(new_embedding_matrix)
    
    # Update the mapping
    start_idx = len(job_id_map)
    for i, jid in enumerate(list_job_ids):
        job_id_map[str(start_idx + i)] = jid
    
    # Save everything with a new timestamp
    timestr = time.strftime("%Y%m%d-%H%M%S")
    new_index_path = os.path.join(output_dir, f"faiss_index_jobs_{timestr}")
    new_mapping_path = os.path.join(output_dir, f"job_id_map_{timestr}.json")
    
    faiss.write_index(index, new_index_path)
    with open(new_mapping_path, 'w') as f:
        json.dump(job_id_map, f)
    
    # Update global variables
    current_index_path = new_index_path
    current_mapping_path = new_mapping_path
    last_update_time = time.time()
    
    print(f"Job vector store updated with {len(new_jobs)} new jobs")
    return True

def add_candidates_to_milvus(new_candidates, database_name="hirenest_recommandation", collection_name="candidate_embeddings"):
    """
    Add new candidates to Milvus candidate collection
    
    Args:
        new_candidates: List of candidate dictionaries from get_new_candidates()
        database_name: Milvus database name
        collection_name: Milvus collection name
    
    Returns:
        bool: True if successful, False otherwise
    """
    global is_candiate_sync_running, is_request_pending
    try:
        if not new_candidates:
            print("[INFO] No new candidates to add to Milvus")
            return False
        
        print(f"[INFO] Adding {len(new_candidates)} new candidates to Milvus")
        
        # Connect to Milvus
        connections.connect(
            alias="default",
            host=MILVUS_HOST,
            port=MILVUS_PORT
        )
        db.using_database(database_name)
        
        # Check if collection exists
        if not utility.has_collection(collection_name):
            print(f"[ERROR] Collection '{collection_name}' does not exist. Please create it first.")
            return False
        
        # Get collection
        collection = Collection(collection_name)
        
        # Get database connection for fetching structured data
        conn = get_db_connection()
        
        try:
            # Extract candidate IDs from new_candidates
            candidate_ids = [str(candidate.get('candidate_id', '')) for candidate in new_candidates if candidate.get('candidate_id')]
            
            if not candidate_ids:
                print("[WARN] No valid candidate IDs found")
                return False
            
            # Fetch structured data for these candidates using the same query as sync
            placeholders = ', '.join(['%s'] * len(candidate_ids))
            query = f"""
            SELECT 
                cd.candidate_id,
                u.headline,
                u.about,
                GROUP_CONCAT(DISTINCT s.sector_name) AS sectors,
                GROUP_CONCAT(DISTINCT k.title) AS keywords,
                GROUP_CONCAT(DISTINCT sk.title) AS skills,
                GROUP_CONCAT(DISTINCT l.title) AS languages,
                GROUP_CONCAT(DISTINCT CONCAT_WS('||', up.title, up.description, up.technologies) SEPARATOR '%%') AS projects,
                GROUP_CONCAT(DISTINCT CONCAT_WS('||', uc.name, uc.authority) SEPARATOR '%%') AS certifications
            FROM 
                candidate_details cd
            LEFT JOIN users u ON cd.candidate_id = u.id
            LEFT JOIN user_sectors us ON cd.candidate_id = us.user_id
            LEFT JOIN sectors s ON us.sector_id = s.id
            LEFT JOIN user_keywords uk ON cd.candidate_id = uk.user_id
            LEFT JOIN keywords k ON uk.keyword_id = k.id
            LEFT JOIN user_skills usk ON cd.candidate_id = usk.user_id
            LEFT JOIN skills sk ON usk.skill_id = sk.id
            LEFT JOIN user_languages ul ON cd.candidate_id = ul.user_id
            LEFT JOIN languages l ON ul.language_id = l.id
            LEFT JOIN user_projects up ON cd.candidate_id = up.user_id
            LEFT JOIN user_certifications uc ON cd.candidate_id = uc.user_id
            WHERE cd.candidate_id IN ({placeholders})
            GROUP BY 
                cd.candidate_id, u.headline, u.about


            """
            
            df = pd.read_sql_query(query, conn, params=candidate_ids)
            
            if df.empty:
                print("[WARN] No structured data found for new candidates")
                return False
            
            # Prepare candidate data for Milvus insertion with structured fields
            candidate_data_for_milvus = []
            
            for _, row in df.iterrows():
                try:
                    candidate_id = int(row['candidate_id'])
                    
                    # Prepare structured data for Milvus (each field separately)
                    candidate_data = {
                        "candidate_id": candidate_id,
                        "headline": str(row['headline'] or ''),
                        "about": str(row['about'] or ''),
                        "sectors": str(row['sectors'] or ''),
                        "keywords": str(row['keywords'] or ''),
                        "skills": str(row['skills'] or ''),
                        "languages": str(row['languages'] or ''),
                        "projects": str(row['projects'] or ''),
                        "certifications": str(row['certifications'] or '')
                    }
                    
                    # Create combined text for embedding (same as sync approach)
                    text_parts = [str(row[col]) for col in df.columns if col != 'candidate_id' and pd.notnull(row[col])]
                    combined_text = " ".join(text_parts)
                    cleaned_text = clean_resume_text(combined_text)
                    
                    # Generate embedding for the combined text
                    embedding = embeddings.embed_query(cleaned_text)
                    candidate_data["embedding"] = embedding
                    
                    candidate_data_for_milvus.append(candidate_data)
                    print(f"[INFO] Prepared candidate {candidate_id} for Milvus insertion with structured data")
                    
                except Exception as e:
                    print(f"[ERROR] Failed to process candidate {row['candidate_id']}: {e}")
                    continue
                    
        finally:
            is_candiate_sync_running = False
            conn.close()
        
        if not candidate_data_for_milvus:
            print("[WARN] No valid candidates to insert into Milvus")
            is_candiate_sync_running = False
            return False
        
        # Insert data into Milvus
        try:
            insert_result = collection.insert(candidate_data_for_milvus)
            collection.flush()
            print(f"[INFO] Successfully inserted {len(candidate_data_for_milvus)} candidates into Milvus")
            print(f"[INFO] Insertion result: {insert_result}")
            is_candiate_sync_running = False
          
            # If another sync request arrived while this run was in progress, re-trigger the update
            try:
                # collection.load()
                # print(f"[INFO] Collection '{collection_name}' loaded for search")
                if is_request_pending:
                    is_request_pending = False
                    request = UpdateRequest(force_full_rebuild=False)
                    background_tasks = BackgroundTasks()
                    asyncio.run(update_candidate_vectorstore_endpoint(request, background_tasks))

            except Exception as load_error:
                is_candiate_sync_running = False
                print(f"[WARN] Failed to load collection after insertion: {load_error}")

            is_candiate_sync_running = False
            return True
            
        except Exception as insert_error:
            is_candiate_sync_running = False
            print(f"[ERROR] Failed to insert candidates into Milvus: {insert_error}")
            return False

        
    except Exception as e:
        is_candiate_sync_running = False
        print(f"[ERROR] Failed to add candidates to Milvus: {e}")
        return False
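
# The insert above assumes the 'candidate_embeddings' collection already exists
# with fields matching the dicts built here. A minimal schema sketch (max_length
# values and dim=1536 for text-embedding-ada-002 are assumptions):
#
#   candidate_fields = [
#       FieldSchema("candidate_id", DataType.INT64, is_primary=True),
#       FieldSchema("headline", DataType.VARCHAR, max_length=2048),
#       FieldSchema("about", DataType.VARCHAR, max_length=65535),
#       FieldSchema("sectors", DataType.VARCHAR, max_length=4096),
#       FieldSchema("keywords", DataType.VARCHAR, max_length=4096),
#       FieldSchema("skills", DataType.VARCHAR, max_length=4096),
#       FieldSchema("languages", DataType.VARCHAR, max_length=2048),
#       FieldSchema("projects", DataType.VARCHAR, max_length=65535),
#       FieldSchema("certifications", DataType.VARCHAR, max_length=8192),
#       FieldSchema("embedding", DataType.FLOAT_VECTOR, dim=1536),
#   ]
#   Collection("candidate_embeddings", CollectionSchema(candidate_fields))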

def add_jobs_to_milvus(new_jobs, database_name="hirenest_recommandation", collection_name="job_embeddings"):
    """
    Add new jobs to Milvus job collection
    
    Args:
        new_jobs: List of job dictionaries from get_new_jobs()
        database_name: Milvus database name
        collection_name: Milvus collection name
    
    Returns:
        bool: True if successful, False otherwise
    """
    try:
        if not new_jobs:
            print("[INFO] No new jobs to add to Milvus")
            return False
        
        print(f"[INFO] Adding {len(new_jobs)} new jobs to Milvus")
        
        # Connect to Milvus
        connections.connect(
            alias="default",
            host=MILVUS_HOST,
            port=MILVUS_PORT
        )
        db.using_database(database_name)
        
        # Check if collection exists
        if not utility.has_collection(collection_name):
            print(f"[ERROR] Collection '{collection_name}' does not exist. Please create it first.")
            return False
        
        # Get collection
        collection = Collection(collection_name)
        
        # Prepare job data for Milvus insertion
        job_data_for_milvus = []
        for job in new_jobs:
            try:
                job_id = job.get('id', '')
                if not job_id:
                    print(f"[WARN] Skipping job with missing ID: {job}")
                    continue
                
                # Get structured job details
                job_details = get_job_details_structured(job_id=job_id)
                
                # Generate embedding for the combined text
                embedding = embeddings.embed_query(job_details["combined_text"])
                
                # Prepare data according to job schema
                job_record = {
                    "job_id": job_id,
                    "title": job_details.get("title", ""),
                    "description": job_details.get("description", ""),
                    "skills": job_details.get("skills", ""),
                    "keywords": job_details.get("keywords", ""),
                    "languages": job_details.get("languages", ""),
                    "sectors": job_details.get("sectors", ""),
                    "embedding": embedding
                }
                
                job_data_for_milvus.append(job_record)
                print(f"[INFO] Prepared job {job_id} for Milvus insertion")
                
            except Exception as e:
                print(f"[ERROR] Failed to process job {job.get('id', 'unknown')}: {e}")
                continue
        
        if not job_data_for_milvus:
            print("[WARN] No valid jobs to insert into Milvus")
            return False
        
        # Insert data into Milvus
        try:
            insert_result = collection.insert(job_data_for_milvus)
            collection.flush()
            print(f"[INFO] Successfully inserted {len(job_data_for_milvus)} jobs into Milvus")
            print(f"[INFO] Insertion result: {insert_result}")
            
            # Ensure collection is loaded for immediate search availability
            try:
                collection.load()
                print(f"[INFO] Collection '{collection_name}' loaded for search")
            except Exception as load_error:
                print(f"[WARN] Failed to load collection after insertion: {load_error}")
            
            return True
            
        except Exception as insert_error:
            print(f"[ERROR] Failed to insert jobs into Milvus: {insert_error}")
            return False
        
    except Exception as e:
        print(f"[ERROR] Failed to add jobs to Milvus: {e}")
        return False
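
# Similarly, the job insert assumes a 'job_embeddings' collection whose schema
# mirrors the job_record keys above: job_id as an INT64 primary key, the text
# fields as VARCHAR, and an 'embedding' FLOAT_VECTOR of the same dimension.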


class JobQuery(BaseModel):
    job_id: int
    top_k: int = 10
    threshold: float = 0.70

class CandidateCountJobQuery(BaseModel):
    job_ids: List[int]
    top_k: int
    threshold: float = 0.70

class CandidateQuery(BaseModel):
    candidate_id: int
    top_k: int = 10
    threshold: float = 0.70

class UpdateRequest(BaseModel):
    force_full_rebuild: bool = False

class UpdateJobRequest(BaseModel):
    force_full_rebuild: bool = False
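
# Example request bodies for the models above (values illustrative):
#   JobQuery:               {"job_id": 42, "top_k": 10, "threshold": 0.70}
#   CandidateCountJobQuery: {"job_ids": [42, 43], "top_k": 5, "threshold": 0.70}
#   CandidateQuery:         {"candidate_id": 101, "top_k": 10, "threshold": 0.70}
#   UpdateRequest / UpdateJobRequest: {"force_full_rebuild": false}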

# @app.post("/recommend_candidates/")
# async def recommend_candidates(query: JobQuery):
#     try:
#         execution_start_time = time.time()
#         # Get structured job details for hybrid search
#         job_details = get_job_details_structured(job_id=query.job_id)
#         job_description = job_details["combined_text"]
#         print(f"Job description: {job_description}")
#         print(f"Job skills: {job_details['skills']}")
#         print(f"Job sectors: {job_details['sectors']}")
#         print(f"Job keywords: {job_details['keywords']}")
        
#         # Check if job has any filterable requirements
#         job_has_requirements = bool(job_details['skills'] or job_details['sectors'] or job_details['keywords'])
#         print(f"Job has requirements: {job_has_requirements}")
        
#         # Connect to Milvus
#         try:
#             connections.connect(
#                 alias="default",
#                 host=MILVUS_HOST,
#                 port=MILVUS_PORT
#             )
#             # Switch to the database
#             db.using_database("hirenest_recommandation")
#         except Exception as e:
#             raise HTTPException(status_code=500, detail=f"Failed to connect to Milvus: {str(e)}")
        
#         # Check if collection exists
#         collection_name = "candidate_embeddings"
#         if not utility.has_collection(collection_name):
#             raise HTTPException(status_code=404, detail=f"Collection '{collection_name}' not found. Please sync data first.")
        
#         # Generate embedding for the job description
#         query_embedding = embeddings.embed_query(job_description)
        
#         # Prepare search parameters for hybrid search
#         search_params = {
#             "metric_type": "IP",  # Inner Product (same as Faiss IndexFlatIP)
#             "params": {
#                 "nprobe": 128  # Number of probes for IVF_FLAT search
#             }
#         }
        
#         # Get collection and handle loading
#         collection = Collection(collection_name)
        
#         # Ensure collection has an index and is loaded
#         try:
#             # Create index if it doesn't exist (idempotent operation)
#             try:
#                 index_params = {
#                     "metric_type": "IP",
#                     "index_type": "IVF_FLAT",
#                     "params": {
#                         "nlist": 1024
#                     }
#                 }
#                 collection.create_index(field_name="embedding", index_params=index_params)
#                 print(f"[INFO] Index ensured for collection '{collection_name}'")
#             except Exception as idx_error:
#                 print(f"[INFO] Index exists or error: {idx_error}")
            
#             # Load the collection
#             collection.load()
#             print(f"[INFO] Collection '{collection_name}' loaded successfully")
            
#         except Exception as load_error:
#             print(f"[ERROR] Failed to prepare collection: {load_error}")
#             raise HTTPException(status_code=500, detail=f"Failed to prepare collection: {str(load_error)}")
        
#         # Create search expression for field-specific filtering using Milvus native capabilities
#         search_expressions = []
        
#         # Build filter expressions for job requirements
#         if job_details['skills']:
#             job_skills = [s.strip() for s in job_details['skills'].split(',') if s.strip()]
#             if job_skills:
#                 skills_conditions = []
#                 for skill in job_skills:
#                     # Add both lowercase and original case for case-insensitive matching
#                     skills_conditions.append(f"skills like '%{skill.lower()}%'")
#                     if skill.lower() != skill:  # Add original case if different
#                         skills_conditions.append(f"skills like '%{skill}%'")
#                 if skills_conditions:
#                     search_expressions.append(f"({' OR '.join(skills_conditions)})")
        
#         # if job_details['sectors']:
#         #     job_sectors = [s.strip() for s in job_details['sectors'].split(',') if s.strip()]
#         #     if job_sectors:
#         #         sectors_conditions = []
#         #         for sector in job_sectors:
#         #             # Add both lowercase and original case for case-insensitive matching
#         #             sectors_conditions.append(f"sectors like '%{sector.lower()}%'")
#         #             if sector.lower() != sector:  # Add original case if different
#         #                 sectors_conditions.append(f"sectors like '%{sector}%'")
#         #         if sectors_conditions:
#         #             search_expressions.append(f"({' OR '.join(sectors_conditions)})")
                    
#         if job_details['keywords']:
#             job_keywords = [k.strip() for k in job_details['keywords'].split(',') if k.strip()]
#             if job_keywords:
#                 keywords_conditions = []
#                 for keyword in job_keywords:
#                     # Add both lowercase and original case for case-insensitive matching
#                     keywords_conditions.append(f"keywords like '%{keyword.lower()}%'")
#                     if keyword.lower() != keyword:  # Add original case if different
#                         keywords_conditions.append(f"keywords like '%{keyword}%'")
#                 if keywords_conditions:
#                     search_expressions.append(f"({' OR '.join(keywords_conditions)})")
        
#         print(f"Search expressions: {search_expressions}")
        
#         # Use Milvus native hybrid search with strict field filtering
#         try:
#             from pymilvus import AnnSearchRequest, RRFRanker
            
#             search_requests = []
            
#             # Only use field-filtered search requests to ensure matching candidates
#             if search_expressions:
#                 # Create separate search requests for each field type for better control
#                 search_limit = min(query.top_k * 3, 300)
                
#                 # Skills-focused search
#                 if any('skills like' in expr for expr in search_expressions):
#                     skills_filter = next((expr for expr in search_expressions if 'skills like' in expr), None)
#                     if skills_filter:
#                         skills_request = AnnSearchRequest(
#                             data=[query_embedding],
#                             anns_field="embedding",
#                             param=search_params,
#                             limit=search_limit,
#                             expr=skills_filter
#                         )
#                         search_requests.append(skills_request)
                
#                 # Sectors-focused search  
#                 # if any('sectors like' in expr for expr in search_expressions):
#                 #     sectors_filter = next((expr for expr in search_expressions if 'sectors like' in expr), None)
#                 #     if sectors_filter:
#                 #         sectors_request = AnnSearchRequest(
#                 #             data=[query_embedding],
#                 #             anns_field="embedding",
#                 #             param=search_params,
#                 #             limit=search_limit,
#                 #             expr=sectors_filter
#                 #         )
#                 #         search_requests.append(sectors_request)
                
#                 # Keywords-focused search
#                 if any('keywords like' in expr for expr in search_expressions):
#                     keywords_filter = next((expr for expr in search_expressions if 'keywords like' in expr), None)
#                     if keywords_filter:
#                         keywords_request = AnnSearchRequest(
#                             data=[query_embedding],
#                             anns_field="embedding", 
#                             param=search_params,
#                             limit=search_limit,
#                             expr=keywords_filter
#                         )
#                         search_requests.append(keywords_request)
                
#                 # If we have multiple field types, also add a combined filter for candidates matching multiple criteria
#                 if len(search_expressions) > 1:
#                     combined_filter = " OR ".join(search_expressions)
#                     combined_request = AnnSearchRequest(
#                         data=[query_embedding],
#                         anns_field="embedding",
#                         param=search_params,
#                         limit=search_limit,
#                         expr=combined_filter
#                     )
#                     search_requests.append(combined_request)
                
#             # If no field conditions exist, fall back to semantic search but with strict post-filtering
#             if not search_requests:
#                 if job_has_requirements:
#                     print("[STRICT_MODE] Job has requirements but no valid search expressions - will filter results strictly")
#                 else:
#                     print("[WARNING] No field filters available, using semantic search only")
                
#                 semantic_request = AnnSearchRequest(
#                     data=[query_embedding],
#                     anns_field="embedding",
#                     param=search_params,
#                     limit=min(query.top_k * 3, 300),
#                     expr=None
#                 )
#                 search_requests = [semantic_request]
            
#             # Perform native Milvus hybrid search with strict filtering
#             search_results = collection.hybrid_search(
#                 reqs=search_requests,
#                 rerank=RRFRanker(),  # Use Reciprocal Rank Fusion for combining results
#                 limit=query.top_k * 2,  # Get more results for final processing
#                 output_fields=["candidate_id", "headline", "about", "skills", "sectors", "keywords", "languages", "projects", "certifications"]
#             )
            
#         except Exception as hybrid_error:
#             print(f"[INFO] Hybrid search failed, falling back to simple vector search: {hybrid_error}")
            
#             # Fallback to simple vector search with field filtering if available
#             fallback_filter = None
#             if search_expressions:
#                 fallback_filter = " OR ".join(search_expressions)
#                 print(f"[INFO] Using fallback filter: {fallback_filter}")
            
#             search_results = collection.search(
#                 data=[query_embedding],
#                 anns_field="embedding",
#                 param=search_params,
#                 limit=min(query.top_k * 3, 300),
#                 expr=fallback_filter,  # Apply field filtering in fallback too
#                 output_fields=["candidate_id", "headline", "about", "skills", "sectors", "keywords", "languages", "projects", "certifications"]
#             )
        
#         if not search_results or not search_results[0]:
#             return {
#                 "status": "No Results",
#                 "message": "No candidates found in the vector database",
#                 "count": 0,
#                 "recommendations": []
#             }
#         recommendations = []
     
        
#         for hit in search_results[0]:
#             try:
#                 candidate_id = hit.entity.get("candidate_id")
#                 embedding_score = float(hit.score)  # Vector similarity score
#                 # Get candidate data from Milvus result
#                 candidate_data = {
#                     "headline": hit.entity.get("headline", ""),
#                     "about": hit.entity.get("about", ""),
#                     "skills": hit.entity.get("skills", ""),
#                     "sectors": hit.entity.get("sectors", ""),
#                     "keywords": hit.entity.get("keywords", ""),
#                     "languages": hit.entity.get("languages", ""),
#                     "projects": hit.entity.get("projects", ""),
#                     "certifications": hit.entity.get("certifications", "")
#                 }
                
#                 # Create combined candidate text for hybrid scoring
#                 candidate_text_parts = []
#                 for field, value in candidate_data.items():
#                     if value and str(value).strip():
#                         candidate_text_parts.append(str(value))
                
#                 candidate_combined_text = " ".join(candidate_text_parts)
                
#                 # Milvus hybrid search already provides RRF-ranked results
#                 # Add supplementary scoring for transparency and additional insights
                
#                 # 1. Use Milvus RRF score as primary (already considers vector + field matching)
#                 milvus_hybrid_score = embedding_score  # This is the RRF score from Milvus
                
#                 prompt = base_prompt()
                
#                 # data  = get_groq_llm(system_message=prompt,resume_text=candidate_combined_text,job_description=job_description,api_key=Groq_API_KEY)
#                 data  = matched_score_llm(system_message=prompt,resume_text=candidate_combined_text,job_description=job_description,api_key=OPENAI_API_KEY)
                
#                 print("data:",data)
#                 parsed_json = safe_parse_json(data)
#                 # Response orchestration
#                 response = {
#                     "job_id": query.job_id,
#                     "candidate_id": candidate_id,
#                     "similarity_score": parsed_json.get("jobFitAnalysis").get("overallMatchScore"),
#                     "data": parsed_json
#                 }             
#                 recommendations.append(response)
                
#             except Exception as e:
#                 print(f"Error processing candidate {hit.entity.get('candidate_id', 'unknown')}: {e}")
#                 continue
        
#         # Sort by hybrid score (descending)
#         # recommendations.sort(key=lambda x: x["similarity_score"], reverse=True)
        
#         # Limit to requested top_k
#         # recommendations = recommendations[:query.top_k]
#         execution_end_time = time.time()
#         execution_time = execution_end_time - execution_start_time
#         return {
#             "status": "Success",
#             "recommendations": recommendations,
#             "execution_time": execution_time
#         }
        
            
#     except Exception as e:
#         raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
@app.post("/total_candidate_count/")
async def total_candidate_count(query: CandidateCountJobQuery):
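    """For each job_id, return how many candidates the filtered Milvus hybrid search finds (capped at top_k)."""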

    if not query.job_ids:
        return {
            "success": False,
            "message": "Input list cannot be empty. Please provide valid values."
        }
    # Connect to Milvus
    try:
        connections.connect(
            alias="default",
            host=MILVUS_HOST,
            port=MILVUS_PORT
        )
        # Switch to the database
        db.using_database("hirenest_recommandation")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to connect to Milvus: {str(e)}")
    # Prepare search parameters for hybrid search
    search_params = {
        "metric_type": "IP",  # Inner Product (same as Faiss IndexFlatIP)
        "params": {
            "nprobe": 128  # Number of probes for IVF_FLAT search
        }
    }
    index_params = {
                        "metric_type": "IP",
                        "index_type": "IVF_FLAT",
                        "params": {
                            "nlist": 1024
                        }
    }
    final_result = []
    for id in query.job_ids:
        try:
            # Get structured job details for hybrid search
            job_details = get_job_details_structured(job_id=id)
            job_description = job_details["combined_text"]
            print(f"Job description: {job_description}")
            print(f"Job skills: {job_details['skills']}")
            print(f"Job sectors: {job_details['sectors']}")
            print(f"Job keywords: {job_details['keywords']}")
            
            # Check if job has any filterable requirements
            job_has_requirements = bool(job_details['skills'] or job_details['sectors'] or job_details['keywords'])
            # print(f"Job has requirements: {job_has_requirements}")
            
            
            
            # Check if collection exists
            collection_name = CANDIDATE_RECOMMANDATION_COLLECTION_NAME
            if not utility.has_collection(collection_name):
                raise HTTPException(status_code=404, detail=f"Collection '{collection_name}' not found. Please sync data first.")
            
            # Generate embedding for the job description
            query_embedding = embeddings.embed_query(job_description)
            
            
            
            # Get collection and handle loading
            collection = Collection(collection_name)
            
            # Ensure collection has an index and is loaded
            try:
                # Create index if it doesn't exist (idempotent operation)
                try:
                    if not collection.has_index():
                        collection.create_index(field_name="embedding", index_params=index_params)
                    print(f"[INFO] Index ensured for collection '{collection_name}'")
                except Exception as idx_error:
                    print(f"[INFO] Index exists or error: {idx_error}")
                
                # Load the collection
                collection.load()
                print(f"[INFO] Collection '{collection_name}' loaded successfully")
                
            except Exception as load_error:
                print(f"[ERROR] Failed to prepare collection: {load_error}")
                raise HTTPException(status_code=500, detail=f"Failed to prepare collection: {str(load_error)}")
            
            # Create search expression for field-specific filtering using Milvus native capabilities
            search_expressions = []
            project_technology_terms = []
            # Build filter expressions for job requirements
            if job_details['skills']:
                job_skills = [s.strip() for s in job_details['skills'].split(',') if s.strip()]
                project_technology_terms.extend(job_skills)
                if job_skills:
                    skills_conditions = []
                    for skill in job_skills:
                        # Add both lowercase and original case for case-insensitive matching
                        skills_conditions.append(f"skills like '%{skill.lower()}%'")
                        if skill.lower() != skill:  # Add original case if different
                            skills_conditions.append(f"skills like '%{skill}%'")
                    if skills_conditions:
                        search_expressions.append(f"({' OR '.join(skills_conditions)})")
            
            # if job_details['sectors']:
            #     job_sectors = [s.strip() for s in job_details['sectors'].split(',') if s.strip()]
            #     if job_sectors:
            #         sectors_conditions = []
            #         for sector in job_sectors:
            #             # Add both lowercase and original case for case-insensitive matching
            #             sectors_conditions.append(f"sectors like '%{sector.lower()}%'")
            #             if sector.lower() != sector:  # Add original case if different
            #                 sectors_conditions.append(f"sectors like '%{sector}%'")
            #         if sectors_conditions:
            #             search_expressions.append(f"({' OR '.join(sectors_conditions)})")
                        
            if job_details['keywords']:
                job_keywords = [k.strip() for k in job_details['keywords'].split(',') if k.strip()]
                project_technology_terms.extend(job_keywords)
                if job_keywords:
                    keywords_conditions = []
                    for keyword in job_keywords:
                        # Add both lowercase and original case for case-insensitive matching
                        keywords_conditions.append(f"keywords like '%{keyword.lower()}%'")
                        if keyword.lower() != keyword:  # Add original case if different
                            keywords_conditions.append(f"keywords like '%{keyword}%'")
                    if keywords_conditions:
                        search_expressions.append(f"({' OR '.join(keywords_conditions)})")

            if project_technology_terms:
                project_conditions = []
                for term in project_technology_terms:
                    project_conditions.append(f"projects like '%{term.lower()}%'")
                    if term.lower() != term:
                        project_conditions.append(f"projects like '%{term}%'")
                search_expressions.append(f"({' OR '.join(project_conditions)})")

            print(f"Search expressions: {search_expressions}")
            
            # Use Milvus native hybrid search with strict field filtering
            try:
                search_requests = []
                technology_filter = []
                
                # Only use field-filtered search requests to ensure matching candidates
                if search_expressions:
                    # Create separate search requests for each field type for better control
                    search_limit = query.top_k 
                    
                    # Skills-focused search
                    if any('skills like' in expr for expr in search_expressions):
                        skills_filter = next((expr for expr in search_expressions if 'skills like' in expr), None)
                        if skills_filter:
                            technology_filter.append(skills_filter)
                            skills_request = AnnSearchRequest(
                                data=[query_embedding],
                                anns_field="embedding",
                                param=search_params,
                                limit=search_limit,
                                expr=skills_filter
                            )
                            search_requests.append(skills_request)
                    
                    # Keywords-focused search
                    if any('keywords like' in expr for expr in search_expressions):
                        keywords_filter = next((expr for expr in search_expressions if 'keywords like' in expr), None)
                        if keywords_filter:
                            technology_filter.append(keywords_filter)
                            keywords_request = AnnSearchRequest(
                                data=[query_embedding],
                                anns_field="embedding", 
                                param=search_params,
                                limit=search_limit,
                                expr=keywords_filter
                            )
                            search_requests.append(keywords_request)

                    if technology_filter:
                        technology_filter = " OR ".join(technology_filter)
                        technology_request = AnnSearchRequest(
                            data=[query_embedding],
                            anns_field="embedding",
                            param=search_params,
                            limit=search_limit,
                            expr=technology_filter
                        )
                        search_requests.append(technology_request)

                    # If we have multiple field types, also add a combined filter for candidates matching multiple criteria
                    if len(search_expressions) > 1:
                        combined_filter = " OR ".join(search_expressions)
                        combined_request = AnnSearchRequest(
                            data=[query_embedding],
                            anns_field="embedding",
                            param=search_params,
                            limit=search_limit,
                            expr=combined_filter
                        )
                        search_requests.append(combined_request)
                    
                # If no field conditions exist, fall back to semantic search but with strict post-filtering
                if not search_requests:
                    if job_has_requirements:
                        print("[STRICT_MODE] Job has requirements but no valid search expressions - will filter results strictly")
                    else:
                        print("[WARNING] No field filters available, using semantic search only")
                    
                    semantic_request = AnnSearchRequest(
                        data=[query_embedding],
                        anns_field="embedding",
                        param=search_params,
                        limit=query.top_k ,
                        expr=None
                    )
                    search_requests = [semantic_request]
                
                # Perform native Milvus hybrid search with strict filtering
                search_results = collection.hybrid_search(
                    reqs=search_requests,
                    rerank=RRFRanker(),  # Use Reciprocal Rank Fusion for combining results
                    limit=query.top_k,  # Final number of fused results to return
                    output_fields=["candidate_id", "headline", "about", "skills", "sectors", "keywords", "languages", "projects", "certifications"]
                )

                
            except Exception as hybrid_error:
                print(f"[INFO] Hybrid search failed, falling back to simple vector search: {hybrid_error}")
                
                # Fallback to simple vector search with field filtering if available
                fallback_filter = None
                if search_expressions:
                    fallback_filter = " OR ".join(search_expressions)
                    print(f"[INFO] Using fallback filter: {fallback_filter}")
                
                search_results = collection.search(
                    data=[query_embedding],
                    anns_field="embedding",
                    param=search_params,
                    limit=query.top_k,
                    expr=fallback_filter,  # Apply field filtering in fallback too
                    output_fields=["candidate_id", "headline", "about", "skills", "sectors", "keywords", "languages", "projects", "certifications"]
                )
            print("search_result:",search_results)
            if not search_results or not search_results[0]:
                result = {
                    "job_id": id,
                    "count": 0,
                }
            else:
                result = {
                    "job_id": id,
                    "count": len(search_results[0]),
                }
            final_result.append(result)

        except Exception as e:
            print(f"[ERROR] Failed to count candidates for job {id}: {e}")
            result = {
                "job_id": id,
                "count": 0,
            }
            final_result.append(result)

    return final_result


@app.post("/recommend_candidates/")
async def recommend_candidates_v2(query: JobQuery,  background_tasks: BackgroundTasks):
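    """Recommend candidates for a job: Milvus hybrid search, cached LLM scores where available, then batched async LLM scoring for the rest."""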
    try:
        # Get structured job details for hybrid search
        job_details = get_job_details_structured(job_id=query.job_id)
        job_description = job_details["combined_text"]
        print(f"Job description: {job_description}")
        print(f"Job skills: {job_details['skills']}")
        print(f"Job sectors: {job_details['sectors']}")
        print(f"Job keywords: {job_details['keywords']}")
        
        # Check if job has any filterable requirements
        job_has_requirements = bool(job_details['skills'] or job_details['sectors'] or job_details['keywords'])
        # print(f"Job has requirements: {job_has_requirements}")
        
        # Connect to Milvus
        try:
            connections.connect(
                alias="default",
                host=MILVUS_HOST,
                port=MILVUS_PORT
            )
            # Switch to the database
            db.using_database("hirenest_recommandation")
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Failed to connect to Milvus: {str(e)}")
        
        # Check if collection exists
        collection_name = CANDIDATE_RECOMMANDATION_COLLECTION_NAME
        if not utility.has_collection(collection_name):
            raise HTTPException(status_code=404, detail=f"Collection '{collection_name}' not found. Please sync data first.")
        
        # Generate embedding for the job description
        query_embedding = embeddings.embed_query(job_description)
        
        # Prepare search parameters for hybrid search
        search_params = {
            "metric_type": "IP",  # Inner Product (same as Faiss IndexFlatIP)
            "params": {
                "nprobe": 128  # Number of probes for IVF_FLAT search
            }
        }
        
        # Get collection and handle loading
        collection = Collection(collection_name)
        
        # Ensure collection has an index and is loaded
        try:
            # Create index if it doesn't exist (idempotent operation)
            try:
                index_params = {
                    "metric_type": "IP",
                    "index_type": "IVF_FLAT",
                    "params": {
                        "nlist": 1024
                    }
                }
                if not collection.has_index():
                    collection.create_index(field_name="embedding", index_params=index_params)
                print(f"[INFO] Index ensured for collection '{collection_name}'")
            except Exception as idx_error:
                print(f"[INFO] Index exists or error: {idx_error}")
            
            # Load the collection
            collection.load()
            print(f"[INFO] Collection '{collection_name}' loaded successfully")
            
        except Exception as load_error:
            print(f"[ERROR] Failed to prepare collection: {load_error}")
            raise HTTPException(status_code=500, detail=f"Failed to prepare collection: {str(load_error)}")
        
        # Create search expression for field-specific filtering using Milvus native capabilities
        search_expressions = []
        project_technology_terms = []
        # Build filter expressions for job requirements
        if job_details['skills']:
            job_skills = [s.strip() for s in job_details['skills'].split(',') if s.strip()]
            project_technology_terms.extend(job_skills)
            if job_skills:
                skills_conditions = []
                for skill in job_skills:
                    # Add both lowercase and original case for case-insensitive matching
                    skills_conditions.append(f"skills like '%{skill.lower()}%'")
                    if skill.lower() != skill:  # Add original case if different
                        skills_conditions.append(f"skills like '%{skill}%'")
                if skills_conditions:
                    search_expressions.append(f"({' OR '.join(skills_conditions)})")
        
        # if job_details['sectors']:
        #     job_sectors = [s.strip() for s in job_details['sectors'].split(',') if s.strip()]
        #     if job_sectors:
        #         sectors_conditions = []
        #         for sector in job_sectors:
        #             # Add both lowercase and original case for case-insensitive matching
        #             sectors_conditions.append(f"sectors like '%{sector.lower()}%'")
        #             if sector.lower() != sector:  # Add original case if different
        #                 sectors_conditions.append(f"sectors like '%{sector}%'")
        #         if sectors_conditions:
        #             search_expressions.append(f"({' OR '.join(sectors_conditions)})")
                    
        if job_details['keywords']:
            job_keywords = [k.strip() for k in job_details['keywords'].split(',') if k.strip()]
            project_technology_terms.extend(job_keywords)
            if job_keywords:
                keywords_conditions = []
                for keyword in job_keywords:
                    # Add both lowercase and original case for case-insensitive matching
                    keywords_conditions.append(f"keywords like '%{keyword.lower()}%'")
                    if keyword.lower() != keyword:  # Add original case if different
                        keywords_conditions.append(f"keywords like '%{keyword}%'")
                if keywords_conditions:
                    search_expressions.append(f"({' OR '.join(keywords_conditions)})")

        if project_technology_terms:
            project_conditions = []
            for term in project_technology_terms:
                project_conditions.append(f"projects like '%{term.lower()}%'")
                if term.lower() != term:
                    project_conditions.append(f"projects like '%{term}%'")
            search_expressions.append(f"({' OR '.join(project_conditions)})")

        print(f"Search expressions: {search_expressions}")
        
        # Use Milvus native hybrid search with strict field filtering
        try:
            search_requests = []
            technology_filter = []
            
            # Only use field-filtered search requests to ensure matching candidates
            if search_expressions:
                # Create separate search requests for each field type for better control
                search_limit = query.top_k 
                
                # Skills-focused search
                if any('skills like' in expr for expr in search_expressions):
                    skills_filter = next((expr for expr in search_expressions if 'skills like' in expr), None)
                    if skills_filter:
                        technology_filter.append(skills_filter)
                        skills_request = AnnSearchRequest(
                            data=[query_embedding],
                            anns_field="embedding",
                            param=search_params,
                            limit=search_limit,
                            expr=skills_filter
                        )
                        search_requests.append(skills_request)
                
                # Sectors-focused search  
                # if any('sectors like' in expr for expr in search_expressions):
                #     sectors_filter = next((expr for expr in search_expressions if 'sectors like' in expr), None)
                #     if sectors_filter:
                #         sectors_request = AnnSearchRequest(
                #             data=[query_embedding],
                #             anns_field="embedding",
                #             param=search_params,
                #             limit=search_limit,
                #             expr=sectors_filter
                #         )
                #         search_requests.append(sectors_request)
                
                # Keywords-focused search
                if any('keywords like' in expr for expr in search_expressions):
                    keywords_filter = next((expr for expr in search_expressions if 'keywords like' in expr), None)
                    if keywords_filter:
                        technology_filter.append(keywords_filter)
                        keywords_request = AnnSearchRequest(
                            data=[query_embedding],
                            anns_field="embedding", 
                            param=search_params,
                            limit=search_limit,
                            expr=keywords_filter
                        )
                        search_requests.append(keywords_request)

                if technology_filter:
                    technology_filter = " OR ".join(technology_filter)
                    technology_request = AnnSearchRequest(
                        data=[query_embedding],
                        anns_field="embedding",
                        param=search_params,
                        limit=search_limit,
                        expr=technology_filter
                    )
                    search_requests.append(technology_request)

                # If we have multiple field types, also add a combined filter for candidates matching multiple criteria
                if len(search_expressions) > 1:
                    combined_filter = " OR ".join(search_expressions)
                    combined_request = AnnSearchRequest(
                        data=[query_embedding],
                        anns_field="embedding",
                        param=search_params,
                        limit=search_limit,
                        expr=combined_filter
                    )
                    search_requests.append(combined_request)
                
            # If no field conditions exist, fall back to semantic search but with strict post-filtering
            if not search_requests:
                if job_has_requirements:
                    print("[STRICT_MODE] Job has requirements but no valid search expressions - will filter results strictly")
                else:
                    print("[WARNING] No field filters available, using semantic search only")
                
                semantic_request = AnnSearchRequest(
                    data=[query_embedding],
                    anns_field="embedding",
                    param=search_params,
                    limit=query.top_k ,
                    expr=None
                )
                search_requests = [semantic_request]
            
            # Perform native Milvus hybrid search with strict filtering
            search_results = collection.hybrid_search(
                reqs=search_requests,
                rerank=RRFRanker(),  # Use Reciprocal Rank Fusion for combining results
                limit=query.top_k,  # Final number of fused results to return
                output_fields=["candidate_id", "headline", "about", "skills", "sectors", "keywords", "languages", "projects", "certifications"]
            )

            
        except Exception as hybrid_error:
            print(f"[INFO] Hybrid search failed, falling back to simple vector search: {hybrid_error}")
            
            # Fallback to simple vector search with field filtering if available
            fallback_filter = None
            if search_expressions:
                fallback_filter = " OR ".join(search_expressions)
                print(f"[INFO] Using fallback filter: {fallback_filter}")
            
            search_results = collection.search(
                data=[query_embedding],
                anns_field="embedding",
                param=search_params,
                limit=query.top_k,
                expr=fallback_filter,  # Apply field filtering in fallback too
                output_fields=["candidate_id", "headline", "about", "skills", "sectors", "keywords", "languages", "projects", "certifications"]
            )
        
        if not search_results or not search_results[0]:
            return {
                "status": "No Results",
                "message": "No candidates found in the vector database",
                "count": 0,
                "recommendations": []
            }
        # Initialize variables for batch processing; fall back to a batch size of 5 if the config value is missing or invalid
        batch_size = int(CANDIDATE_SCORE_BATCH_SIZE) if str(CANDIDATE_SCORE_BATCH_SIZE).strip().isdigit() else 0
        batch_size = batch_size or 5
        batch_data = []
        recommendations = []

        # Step 1: Collect candidate data for batching
       
        print(f"[INFO] Collecting candidate data from {len(search_results[0])} search results")
  
        for hit in search_results[0]:
            try:
                candidate_id = hit.entity.get("candidate_id")
                cached_response = get_existing_response(query.job_id, candidate_id)
                if cached_response:
                    print(f"[CACHE HIT] Job {query.job_id}, Candidate {candidate_id}")
                    parsed_json = json.loads(cached_response)
                    recommendations.append({ "job_id": query.job_id,
                    "candidate_id": candidate_id,
                    "similarity_score": parsed_json.get("jobFitAnalysis").get("overallMatchScore"),
                    "data": parsed_json
                    })
                    continue
                else:
                    candidate_data = {
                        "headline": hit.entity.get("headline", ""),
                        "about": hit.entity.get("about", ""),
                        "skills": hit.entity.get("skills", ""),
                        "sectors": hit.entity.get("sectors", ""),
                        "keywords": hit.entity.get("keywords", ""),
                        "languages": hit.entity.get("languages", ""),
                        "projects": hit.entity.get("projects", ""),
                        "certifications": hit.entity.get("certifications", "")
                    }   
                    candidate_text_parts = []
                    for field, value in candidate_data.items():
                        if value and str(value).strip():
                            if field == "skills":
                                candidate_text_parts.append(f"`Skills: {str(value)}`")
                            candidate_text_parts.append(str(value))
                    
                    candidate_combined_text = " ".join(candidate_text_parts)

                    batch_data.append({
                        "candidate_id": candidate_id,
                        "candidate_combined_text": candidate_combined_text,
                    })
            except Exception as e:
                print(f"Error preparing candidate data: {e}")
                continue

        # Step 2: Process candidates in batches through LLM
        
        print(f"[INFO] Processing {len(batch_data)} candidates in batches of {batch_size}")
       
        client = AsyncOpenAI(api_key=OPENAI_API_KEY)

        async def matched_score_llm_async(system_message, resume_text, job_description):
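            """Score one batch of candidate resumes against the job description via the async OpenAI chat API."""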
            response = await client.chat.completions.create(
                model=LLM_MODEL or "gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": f"Job Description:\n{job_description}\n\nCandidates:\n{resume_text}"}
                ],
                temperature=1
            )
            return response.choices[0].message.content

        # Build async tasks for all batches
        tasks = []
        candidate_batches = []
        for i in range(0, len(batch_data), batch_size):
            current_batch = batch_data[i:i + batch_size]
            batch_candidates_text = []
            candidate_ids_in_batch = []
            
            for idx, c in enumerate(current_batch):
                candidate_ids_in_batch.append(c["candidate_id"])
                candidate_text = f"""
                Candidate {idx + 1} (ID: {c["candidate_id"]}):
                Resume: {c["candidate_combined_text"]}
                ---
                """
                batch_candidates_text.append(candidate_text)

            combined_batch_text = "\n".join(batch_candidates_text)
            prompt = base_batch_prompt()

            tasks.append(matched_score_llm_async(
                system_message=prompt,
                resume_text=combined_batch_text,
                job_description=job_description
            ))
            candidate_batches.append((current_batch, candidate_ids_in_batch))

        # Run all batch calls concurrently
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Process results
        for (current_batch, candidate_ids_in_batch), batch_response in zip(candidate_batches, results):
            if isinstance(batch_response, Exception):
                print(f"[ERROR] LLM call failed: {batch_response}")
                continue

            print(f"[INFO] Batch response received: {len(str(batch_response))} characters")
            parsed_batch = safe_parse_batch_json(batch_response)
            

            if not isinstance(parsed_batch, list):
                print(f"[ERROR] Expected list, got {type(parsed_batch)}")
                continue

            if len(parsed_batch) != len(current_batch):
                print(f"[WARNING] Mismatch in candidate counts")
                min_length = min(len(parsed_batch), len(current_batch))
                parsed_batch = parsed_batch[:min_length]
                current_batch = current_batch[:min_length]
                candidate_ids_in_batch = candidate_ids_in_batch[:min_length]

            for candidate, parsed_json, candidate_id in zip(current_batch, parsed_batch, candidate_ids_in_batch):
                stored_candidate_id = candidate["candidate_id"]
                updated_parsed_json = update_overall_match_score(parsed_json)
                save_response_async(query.job_id, stored_candidate_id, json.dumps(updated_parsed_json), background_tasks)

                # Return the updated JSON so fresh results match what is cached
                response = {
                    "job_id": query.job_id,
                    "candidate_id": candidate_id,
                    "similarity_score": updated_parsed_json.get("jobFitAnalysis", {}).get("overallMatchScore"),
                    "data": updated_parsed_json
                }
                recommendations.append(response)
                
                print(f"[INFO] Processed candidate {stored_candidate_id} with overall score: {parsed_json.get('jobFitAnalysis', {}).get('overallMatchScore', 'N/A')}")

        print(f"[INFO] Successfully processed {len(recommendations)} candidates")
        return {
            "status": "Success",
            "count": len(recommendations),
            "recommendations": recommendations
        }
               
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")


import sqlite3  # backs the lightweight recommendation cache below


def get_existing_response(job_id, candidate_id):
    conn = sqlite3.connect("recommendations.db")
    cursor = conn.cursor()
    cursor.execute("""
        SELECT llm_response FROM recommendations 
        WHERE job_id = ? AND candidate_id = ?
    """, (job_id, candidate_id))
    row = cursor.fetchone()
    conn.close()
    return row[0] if row else None



def save_response_async(job_id, candidate_id, llm_response, background_tasks):
    # Run DB insert as background task so main flow is not blocked
    def _save():
        conn = sqlite3.connect("recommendations.db")
        cursor = conn.cursor()
        cursor.execute("""
            INSERT OR REPLACE INTO recommendations (job_id, candidate_id, llm_response)
            VALUES (?, ?, ?)
        """, (job_id, candidate_id, llm_response))
        conn.commit()
        conn.close()
    background_tasks.add_task(_save)
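

# The SQLite cache above assumes a `recommendations` table already exists; it may be created
# elsewhere in this service. A minimal, idempotent initializer sketch (table and column names
# taken from the queries above) would be:
def init_recommendations_cache(db_path: str = "recommendations.db"):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS recommendations (
            job_id INTEGER NOT NULL,
            candidate_id INTEGER NOT NULL,
            llm_response TEXT,
            PRIMARY KEY (job_id, candidate_id)
        )
    """)
    conn.commit()
    conn.close()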



is_candiate_sync_running = False
is_request_pending = False


    
@app.post("/update_vectorstore/")
async def update_candidate_vectorstore_endpoint(request: UpdateRequest, background_tasks: BackgroundTasks):
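    """Start a background Milvus sync: a full rebuild when force_full_rebuild is set, otherwise an incremental update with newly found candidates."""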
    global is_candiate_sync_running, is_request_pending
    try:
        if is_candiate_sync_running:
            is_request_pending = True
            return {"status": "Success", "message": "candiate sync running please wait"}

        
        if request.force_full_rebuild:
            # Run a full rebuild using Milvus sync in the background
            background_tasks.add_task(
                sync_data_to_milvus,
                database_name=MILVUS_DB_NAME,
                candidate_collection_name=CANDIDATE_RECOMMANDATION_COLLECTION_NAME,
                job_collection_name="job_embeddings",
                sync_candidates=True,  # Only sync candidates for full rebuild
                sync_jobs=False,
                batch_size=100,
                force_recreate=True
            )
            return {"status": "Success", "message": "Full vector store rebuild started in the background using Milvus"}
        else:
            new_candidates = get_new_candidates()
            if not new_candidates:
                return {"status": "Success", "message": "No new candidates found, Milvus vector store is up to date"}
            
            is_candiate_sync_running = True
            background_tasks.add_task(add_candidates_to_milvus, new_candidates)
            return {
                "status": "Success", 
                "message": f"Milvus vector store update started in the background with {len(new_candidates)} new candidates"
            }
    except Exception as e:
        is_candiate_sync_running = False
        raise HTTPException(status_code=500, detail=f"Error updating Milvus vector store: {str(e)}")

@app.get("/vectorstore_status/")
async def get_vectorstore_status():
    try:
        # Legacy Faiss-based status check; requires the optional Faiss dependency
        if not FAISS_AVAILABLE:
            return {"status": "Unavailable", "message": "Faiss is not installed; legacy vector store status is unavailable"}
        # Check if vector store exists
        get_latest_files()
        load_candidate_mapping()
        
        if not current_index_path:
            return {"status": "Not Initialized", "message": "Vector store has not been created yet"}
        
        # Get vector store information
        index = faiss.read_index(current_index_path)
        total_vectors = index.ntotal
        
        return {
            "status": "Active",
            "total_candidates": total_vectors,
            "last_update": time.ctime(last_update_time) if last_update_time else "Unknown",
            "index_path": os.path.basename(current_index_path),
            "mapping_path": os.path.basename(current_mapping_path)
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error getting vector store status: {str(e)}")



@app.post("/recommend_jobs/")
async def recommend_jobs(query: CandidateQuery):
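    """Recommend jobs for a candidate using Milvus hybrid search plus fuzzy, spaCy and field-overlap re-scoring."""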
    try:
        # Get structured candidate details for hybrid search
        candidate_details = get_candidate_details_structured(candidate_id=query.candidate_id)
        candidate_text = candidate_details["combined_text"]
        print(f"Candidate text: {candidate_text}")
        print(f"Candidate skills: {candidate_details['skills']}")
        print(f"Candidate sectors: {candidate_details['sectors']}")
        print(f"Candidate keywords: {candidate_details['keywords']}")
        
        # Check if candidate has any filterable criteria
        candidate_has_requirements = bool(candidate_details['skills'] or candidate_details['sectors'] or candidate_details['keywords'])
        print(f"Candidate has requirements: {candidate_has_requirements}")
        
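        # NOTE: this relies on a module-level spaCy pipeline (e.g. nlp = spacy.load("en_core_web_lg"))
        # having been initialized earlier in this file; it is not loaded in this section.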
        candidate_doc = nlp(candidate_text)
        
        # Connect to Milvus
        try:
            connections.connect(
                alias="default",
                host=MILVUS_HOST,
                port=MILVUS_PORT
            )
            # Switch to the database
            db.using_database(MILVUS_DB_NAME)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Failed to connect to Milvus: {str(e)}")
        
        # Check if job collection exists
        job_collection_name = JOB_RECOMMANDATION_COLLECTION_NAME
        if not utility.has_collection(job_collection_name):
            raise HTTPException(status_code=404, detail=f"Collection '{job_collection_name}' not found. Please sync job data first.")
        
        # Generate embedding for the candidate profile
        query_embedding = embeddings.embed_query(candidate_text)
        
        # Get job collection and handle loading
        job_collection = Collection(job_collection_name)
        
        # Ensure collection has an index and is loaded
        try:
            # Create index if it doesn't exist
            try:
                index_params = {
                    "metric_type": "IP",
                    "index_type": "IVF_FLAT",
                    "params": {
                        "nlist": 1024
                    }
                }
                job_collection.create_index(field_name="embedding", index_params=index_params)
                print(f"[INFO] Index ensured for job collection '{job_collection_name}'")
            except Exception as idx_error:
                print(f"[INFO] Job index exists or error: {idx_error}")
            
            # Load the collection
            job_collection.load()
            print(f"[INFO] Job collection '{job_collection_name}' loaded successfully")
            
        except Exception as load_error:
            print(f"[ERROR] Failed to prepare job collection: {load_error}")
            raise HTTPException(status_code=500, detail=f"Failed to prepare job collection: {str(load_error)}")
        
        # Prepare search parameters
        search_params = {
            "metric_type": "IP",  # Inner Product (same as Faiss IndexFlatIP)
            "params": {
                "nprobe": 128  # Number of probes for IVF_FLAT search
            }
        }
        
        # Prepare search expressions for candidate fields
        search_expressions = []
        
        # Create search filters based on candidate's profile (more flexible matching)
        if candidate_details['skills']:
            skills_list = [skill.strip() for skill in candidate_details['skills'].split(',') if skill.strip()]
            # Use key skills for filtering (limit to avoid too many expressions)
            key_skills = skills_list[:5]  # Limit to top 5 skills
            for skill in key_skills:
                if skill and len(skill) > 2:  # Only use skills with more than 2 characters
                    # Extract key words from compound skills
                    skill_words = skill.split()
                    for word in skill_words:
                        if len(word) > 3:  # Only significant words
                            search_expressions.append(f"skills like '%{word.lower()}%'")
        
        if candidate_details['sectors']:
            sectors_list = [sector.strip() for sector in candidate_details['sectors'].split(',') if sector.strip()]
            for sector in sectors_list:
                if sector and len(sector) > 2:
                    search_expressions.append(f"sectors like '%{sector.lower()}%'")
        
        if candidate_details['keywords']:
            keywords_list = [keyword.strip() for keyword in candidate_details['keywords'].split(',') if keyword.strip()]
            for keyword in keywords_list:
                if keyword and len(keyword) > 2:
                    search_expressions.append(f"keywords like '%{keyword.lower()}%'")
        
        print(f"Search expressions: {search_expressions}")
        
        # Use Milvus native hybrid search with field filtering for jobs
        try:
            search_requests = []
            
            # Always include a semantic search request as baseline
            semantic_request = AnnSearchRequest(
                data=[query_embedding],
                anns_field="embedding",
                param=search_params,
                limit=min(query.top_k * 2, 200),
                expr=None  # No filtering for semantic search
            )
            search_requests.append(semantic_request)
            
            # Add field-filtered search requests if candidate has requirements
            if search_expressions and candidate_has_requirements:
                search_limit = min(query.top_k * 2, 200)
                
                # Create a combined filter with OR logic for broader matching
                if search_expressions:
                    # Limit expressions to avoid overly complex queries
                    limited_expressions = search_expressions[:10]  # Use only top 10 expressions
                    combined_filter = " OR ".join(limited_expressions)
                    
                    filtered_request = AnnSearchRequest(
                        data=[query_embedding],
                        anns_field="embedding",
                        param=search_params,
                        limit=search_limit,
                        expr=combined_filter
                    )
                    search_requests.append(filtered_request)
            
            # Perform hybrid search
            if len(search_requests) > 1:
                search_results = job_collection.hybrid_search(
                    reqs=search_requests,
                    rerank=RRFRanker(),  # Use Reciprocal Rank Fusion
                    limit=query.top_k * 2,  # Get more results for final processing
                    output_fields=["job_id", "title", "description", "skills", "keywords", "languages", "sectors"]
                )
            else:
                # Fallback to single semantic search
                search_results = job_collection.search(
                    data=[query_embedding],
                    anns_field="embedding",
                    param=search_params,
                    limit=query.top_k * 2,
                    output_fields=["job_id", "title", "description", "skills", "keywords", "languages", "sectors"]
                )
                
        except Exception as hybrid_error:
            print(f"[INFO] Hybrid search failed, falling back to simple vector search: {hybrid_error}")
            
            # Fallback to simple vector search with field filtering if available
            fallback_filter = None
            if search_expressions:
                fallback_filter = " OR ".join(search_expressions)
                print(f"[INFO] Using fallback filter: {fallback_filter}")
            
            search_results = job_collection.search(
                data=[query_embedding],
                anns_field="embedding",
                param=search_params,
                limit=min(query.top_k * 3, 300),
                expr=fallback_filter,  # Apply field filtering in fallback too
                output_fields=["job_id", "title", "description", "skills", "keywords", "languages", "sectors"]
            )
        
        if not search_results or not search_results[0]:
            return {
                "status": "No Results",
                "message": "No jobs found in the vector database",
                "count": 0,
                "recommendations": []
            }
        
        # Process results using Milvus data directly (no need for additional SQL queries)
        recommendations = []
        
        for hit in search_results[0]:
            try:
                # Access entity data differently for Milvus
                entity = hit.entity
                job_id = entity.get("job_id") if hasattr(entity, 'get') else entity.job_id
                embedding_similarity_score = float(hit.score)  # Milvus similarity score
                
                # Get job data from Milvus result - handle different entity access patterns
                if hasattr(entity, 'get'):
                    job_data = {
                        "title": entity.get("title", ""),
                        "description": entity.get("description", ""),
                        "skills": entity.get("skills", ""),
                        "keywords": entity.get("keywords", ""),
                        "languages": entity.get("languages", ""),
                        "sectors": entity.get("sectors", "")
                    }
                else:
                    job_data = {
                        "title": getattr(entity, "title", ""),
                        "description": getattr(entity, "description", ""),
                        "skills": getattr(entity, "skills", ""),
                        "keywords": getattr(entity, "keywords", ""),
                        "languages": getattr(entity, "languages", ""),
                        "sectors": getattr(entity, "sectors", "")
                    }
                
                # Create concatenated text for similarity matching (same as before)
                rich_job_text = " ".join([
                    str(job_data['title']),
                    str(job_data['description']),
                    str(job_data['skills']),
                    str(job_data['keywords']),
                    str(job_data['languages']),
                    str(job_data['sectors'])
                ])
                
                # Calculate additional similarity scores for hybrid approach  
                rapid_fuzz_score = fuzz.token_set_ratio(candidate_text, rich_job_text) / 100.0
                
                # SpaCy similarity score
                job_doc = nlp(rich_job_text)
                spacy_similarity_score = job_doc.similarity(candidate_doc)
                
                # Calculate field-specific matching scores
                def calculate_field_overlap(candidate_field, job_field):
                    if not candidate_field or not job_field:
                        return 0.0
                    
                    candidate_items = set(item.strip().lower() for item in candidate_field.split(',') if item.strip())
                    job_items = set(item.strip().lower() for item in job_field.split(',') if item.strip())
                    
                    if not candidate_items or not job_items:
                        return 0.0
                    
                    overlap = len(candidate_items.intersection(job_items))
                    return overlap / len(candidate_items) if candidate_items else 0.0
                
                # Calculate individual field scores
                skills_match_score = calculate_field_overlap(candidate_details['skills'], job_data['skills'])
                sectors_match_score = calculate_field_overlap(candidate_details['sectors'], job_data['sectors'])
                keywords_match_score = calculate_field_overlap(candidate_details['keywords'], job_data['keywords'])
                
                # Calculate weighted field match score
                field_match_score = (
                    0.5 * skills_match_score +
                    0.3 * sectors_match_score +
                    0.2 * keywords_match_score
                )
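                # Worked example (hypothetical overlaps): skills 0.4, sectors 0.0, keywords 0.5
                # give 0.5 * 0.4 + 0.3 * 0.0 + 0.2 * 0.5 = 0.30.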
                
                # Use Milvus hybrid score as primary, enhanced with field matching
                milvus_hybrid_score = embedding_similarity_score
                
                # Calculate final hybrid score with field matching consideration
                final_hybrid_score = (
                    0.80 * milvus_hybrid_score +
                    0.15 * rapid_fuzz_score +
                    0.05 * spacy_similarity_score
                )
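                # Worked example (hypothetical scores): Milvus 0.70, RapidFuzz 0.50, spaCy 0.60
                # give 0.80 * 0.70 + 0.15 * 0.50 + 0.05 * 0.60 = 0.665 before the field-match boost below.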
                
                # Apply field match boost for better relevance
                if field_match_score > 0:
                    final_hybrid_score = min(1.0, final_hybrid_score + (field_match_score * 0.1))
                
                # Post-processing filter: prefer jobs with field matches but don't exclude all others
                # Only apply strict filtering if candidate has very specific requirements
                if candidate_has_requirements and len(search_expressions) > 5:
                    has_field_match = (skills_match_score > 0 or sectors_match_score > 0 or keywords_match_score > 0)
                    # Apply less strict filtering - only skip if no field matches AND low semantic similarity
                    if not has_field_match and milvus_hybrid_score < 0.6:
                        continue  # Skip only clearly irrelevant jobs
                
                # Create recommendation object with enhanced scores
                recommendations.append({
                    "job_id": str(job_id),
                    "title": job_data['title'],
                    "description": job_data['description'],
                    "skills": job_data['skills'],
                    "keywords": job_data['keywords'],
                    "languages": job_data['languages'],
                    "sectors": job_data['sectors'],
                    "similarity_score": final_hybrid_score,
                    "milvus_hybrid_score": float(milvus_hybrid_score),
                    "rapid_fuzz_score": float(rapid_fuzz_score),
                    "spacy_similarity_score": float(spacy_similarity_score),
                    "field_match_score": float(field_match_score),
                    "skills_match_score": float(skills_match_score),
                    "sectors_match_score": float(sectors_match_score),
                    "keywords_match_score": float(keywords_match_score),
                    "embedding_similarity_score": embedding_similarity_score,  # Keep for backward compatibility
                    "match_strength": (
                        "Strong" if final_hybrid_score >= 0.80 else
                        "Moderate" if final_hybrid_score >= 0.70 else
                        "Weak"
                    ),
                })
                
            except Exception as e:
                print(f"[ERROR] Error processing job: {e}")
                continue
        
        # Sort by similarity score (descending) - same as before
        recommendations.sort(key=lambda x: x["similarity_score"], reverse=True)
        
        # Split based on threshold - same logic as before
        above_threshold = [rec for rec in recommendations if rec["similarity_score"] >= query.threshold]
        below_threshold = [rec for rec in recommendations if rec["similarity_score"] < query.threshold]
        
        # Return same response structure as before
        if above_threshold:
            return {
                "status": "Success",
                "count": len(above_threshold),
                "recommendations": above_threshold
            }
        else:
            return {
                "status": "Below Threshold",
                "message": "All active jobs are below the similarity threshold",
                "count": len(below_threshold),
                "recommendations": below_threshold
            }
            
    except Exception as e:
        import traceback
        error_detail = traceback.format_exc()
        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}\n{error_detail}")



@app.post("/update_Job_vectorstore/")
async def update_vectorstore_endpoint(request: UpdateJobRequest, background_tasks: BackgroundTasks):
    try:
        if request.force_full_rebuild:
            # Run a full rebuild using Milvus sync in the background
            background_tasks.add_task(
                sync_data_to_milvus,
                database_name=MILVUS_DB_NAME,
                candidate_collection_name=CANDIDATE_RECOMMANDATION_COLLECTION_NAME,
                job_collection_name=JOB_RECOMMANDATION_COLLECTION_NAME,
                sync_candidates=False,  # Only sync jobs for full rebuild
                sync_jobs=True,
                batch_size=100,
                force_recreate=True
            )
            return {"status": "Success", "message": "Full job vector store rebuild started in the background using Milvus"}
        else:
            # Check for new jobs
            new_jobs = get_new_jobs()
                
            if not new_jobs:
                return {"status": "Success", "message": "No new jobs found, Milvus vector store is up to date"}
            else:
                # Add new jobs to Milvus in the background
                background_tasks.add_task(add_jobs_to_milvus, new_jobs)
                return {
                    "status": "Success", 
                    "message": f"Milvus vector store update started in the background with {len(new_jobs)} new jobs"
                }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error updating Milvus vector store: {str(e)}")
    


# @app.on_event("startup")
# async def startup_event():
#     # Initialize the candidate vector store
#     get_vectorstore()
#     # Initialize the job vector store
#     get_job_vectorstore()


# Add this new model class to your existing models
class MatchQuery(BaseModel):
    job_id: int
    candidate_id: int
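
# Illustrative request body for /match_job_candidate/ and /match_job_candidate_score/
# (hypothetical IDs): {"job_id": 101, "candidate_id": 202}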




nlp = spacy.load("en_core_web_lg")  # Large model for better semantic similarity

@app.post("/match_job_candidate/")
async def match_job_candidate(query: MatchQuery):
    try:
        # Strategic data acquisition
        job_description = get_job_description_by_id_match(job_id=query.job_id)
        # print("get_job_description_by_id_match:", job_description)
        candidate_profile = get_candidate_profile_by_id_matched(candidate_id=query.candidate_id)
        # print("candidate_profile:", candidate_profile)
        
        # Get fuzzy score from existing function
        # result = compare_resume_with_jd_text(jd_text=job_description, candidate_profile=candidate_profile)
        if isinstance(candidate_profile, dict):
            candidate_text = extract_resume_text(candidate_profile)
            # Proceed with business logic
            print("Valid input: candidate_profile is a dictionary.")
        else:
            candidate_text=candidate_profile
            # Raise a flag or trigger exception handling
            print("Invalid input: candidate_profile is not a dictionary.")
        # print("candidate_text>>>>>>>>>>>:",candidate_text)
        # fuzzy_score = result['fuzzy_similarity_score']
        
        # Add RapidFuzz token_set_ratio
        rapid_fuzz_score = fuzz.token_set_ratio(job_description, candidate_text) / 100.0
        
        # Generate embeddings
        job_embedding = embeddings.embed_query(job_description)
        candidate_embedding = embeddings.embed_query(str(candidate_profile))

        # FAISS-based similarity
        job_embedding_np = np.array([job_embedding], dtype=np.float32)
        candidate_embedding_np = np.array([candidate_embedding], dtype=np.float32)

        dimension = job_embedding_np.shape[1]
        temp_index = faiss.IndexFlatIP(dimension)
        temp_index.add(job_embedding_np)

        distances, _ = temp_index.search(candidate_embedding_np, k=1)
        embedding_similarity_score = float(distances[0][0])

        # spaCy-based similarity
        job_doc = nlp(job_description)
        candidate_doc = nlp(str(candidate_text))
        spacy_similarity_score = job_doc.similarity(candidate_doc)

        # Governance over hybrid scoring - now incorporating the rapid_fuzz_score
        hybrid_score = (0.4 * spacy_similarity_score) + (0.6 * rapid_fuzz_score)
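        # Worked example (hypothetical scores): spaCy 0.75 and RapidFuzz 0.60 give
        # 0.4 * 0.75 + 0.6 * 0.60 = 0.66, i.e. a "Weak" match under the 0.70 cutoff below.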

        # Response orchestration
        response = {
            "job_id": query.job_id,
            "candidate_id": query.candidate_id,
            "embedding_similarity_score": embedding_similarity_score,
            "spacy_similarity_score": spacy_similarity_score,
            "rapid_fuzz_score": rapid_fuzz_score,  # Added the rapidfuzz score
            "hybrid_score": hybrid_score,
            # "hybrid_score": fuzzy_score,
            "match_strength": (
                "Strong" if hybrid_score >= 0.80 else
                "Moderate" if hybrid_score >= 0.70 else
                "Weak"
            ),
            "above_threshold": hybrid_score >= 0.70
        }

        return {
            "status": "Success",
            "match_data": response
        }
    except Exception as e:
        # Defensive programming
        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")


@app.post("/match_job_candidate_score/")
async def match_job_candidate_score(query: MatchQuery):
    try:
        # Strategic data acquisition
        job_description = get_job_description_by_id_match(job_id=query.job_id)
        # print("job_description:",job_description)
        # print("get_job_description_by_id_match:", job_description)
        candidate_profile = get_candidate_profile_by_id_matched(candidate_id=query.candidate_id)
        # print("candidate_profile:",candidate_profile)
        # print("candidate_profile:", candidate_profile)
        
        # Get fuzzy score from existing function
        # result = compare_resume_with_jd_text(jd_text=job_description, candidate_profile=candidate_profile)
        if isinstance(candidate_profile, dict):
            candidate_text = extract_resume_text(candidate_profile)
            # Proceed with business logic
            print("Valid input: candidate_profile is a dictionary.")
        else:
            candidate_text=candidate_profile
            # Raise a flag or trigger exception handling
            print("Invalid input: candidate_profile is not a dictionary.")
                
        # print("candidate_text>>>>>>>>>>>:",candidate_text)
        # fuzzy_score = result['fuzzy_similarity_score']
        prompt = base_prompt()
        data  = matched_score_llm(system_message=prompt,resume_text=candidate_text,job_description=job_description,api_key=OPENAI_API_KEY)
        parsed_json = safe_parse_json(data)
        updated_json = update_overall_match_score(parsed_json)
        
        # Response orchestration
        response = {
            "job_id": query.job_id,
            "candidate_id": query.candidate_id,
            "data_score": updated_json
        }

        return {
            "status": "Success",
            "match_data": response
        }
    except Exception as e:
        # Defensive programming
        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
    
def update_overall_match_score(data):
    # Navigate into category scores
    category_scores = data.get("jobFitAnalysis", {}).get("categoryScores", {})
    
    # Fields we want to average
    fields = [
        "hardSkillsAndTechStack",
        "experienceAndProjects",
        "toolsAndMethodologies",
        "softSkills",
        "educationAndBackground",
        "culturalAndTeamFit",
    ]
    
    # Extract values safely
    values = [category_scores.get(field, 0) for field in fields]
    
    # Compute the average over the six category fields
    overall = sum(values) / len(fields) if fields else 0

    # Update the overall match score (rounded to 2 decimals)
    data.setdefault("jobFitAnalysis", {})["overallMatchScore"] = round(overall, 2)
    
    return data
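
# Illustrative usage of update_overall_match_score (hypothetical category scores):
#   parsed = {"jobFitAnalysis": {"categoryScores": {
#       "hardSkillsAndTechStack": 80, "experienceAndProjects": 70, "toolsAndMethodologies": 60,
#       "softSkills": 90, "educationAndBackground": 50, "culturalAndTeamFit": 40}}}
#   update_overall_match_score(parsed)["jobFitAnalysis"]["overallMatchScore"]  # -> 65.0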



class Skill(BaseModel):
    id: int
    title: str


class QuestionItem(BaseModel):
    id: int
    question: str
    status: str
    question_order: int
    skill: Skill
    difficulty: str
    question_type: str
    start_time: str  # Format: HH:MM:SS
    video_url: str
    question_selected_option: Optional[str]  # Optional in case it's not yet answered

class VideoInput(BaseModel):
    interview_id:int
    m3u8_url: str
    questions: List[QuestionItem]


# from utils_video import convert_m3u8_to_mp4,extract_audio_from_video,process_video,chat_with_openai,process_audio_iterations,extract_question_data,extract__final_data,merge_transcripts

# @app.post("/upload-video/")
# async def upload_video(input: VideoInput):
#     try:
#         m3u8_url = input.m3u8_url
#         # questions = input.questions
#         questions_json = [q.dict() for q in input.questions]
#         interview_id = input.interview_id
#         # breakpoint()

#         # print(questions)
#         temp_output_mp4 = UPLOAD_DIR / "converted_video.mp4"
#         # breakpoint()
#         print("starting_ mp4 processing")
#         success = convert_m3u8_to_mp4(m3u8_url, str(temp_output_mp4))
#         print("Mp4 processing ends")

#         if not success:
#             raise HTTPException(status_code=500, detail="M3U8 conversion failed.")

#         video_path = temp_output_mp4
#         audio_path = UPLOAD_DIR / "extracted_audio.mp3"
#         print("Staring audio conversion")
#         extract_audio_from_video(video_path, audio_path)
#         print("audio conversion ends")
#         print("Starting transcription")
        
#         transcript_questions = process_audio_iterations(iterations=questions_json,audio_file_path=audio_path,api_key=OPENAI_API_KEY)
#         transcript_prompt = extract_question_data(transcript_questions)
#         # transcript = translate_audio_file(file_path=audio_path, api_key=OPENAI_API_KEY)
#         print("transcription ends")
#         print("Starting analysis")
#         prompt = prompt_analysis(transcript=transcript_prompt)
#         facial_result = process_video(video_path)
#         final_result = chat_with_openai(
#             system_message=prompt,
#             api_key=OPENAI_API_KEY,
#             facial_sentiments=facial_result,
#             quetions=questions_json  # Assuming typo of "quetions" from original
#         )

#         parsed_final = safe_parse_json(final_result)
#         final_transcript = extract__final_data(transcript_questions)
#         merged_transcript = merge_transcripts(transcript_prompt)
#         print("analysis ends")
       
#         import httpx
#         url = "https://hirenest.codiantdev.com/api/save-transcription"
 
#         payload = {
#             "message":"Video processed",
#             "interview_id":interview_id,
#             "transcript":final_transcript,
#             "final_result":parsed_final
#         }
#         print(payload)
#         async with httpx.AsyncClient() as client:
#             response = await client.post(url, json=payload)

#         return {"status_code": response.status_code, "response": response.json(),"output":payload} 

#         # return JSONResponse(content={
#         #     "message": "Video processed",
#         #     "interview_id":interview_id,
#         #     "transcript_data": final_transcript,
#         #     "merged_transcript":merged_transcript,
#         #     "final_result": parsed_final
#         # })

#     except Exception as e:
#         raise HTTPException(status_code=500, detail=str(e))



class CandidateQuery(BaseModel):
    candidate_id: int
    top_k: int = 20
    page: int = 1
    page_size: int = 10
    threshold: float = 0.5
    order_by: str = "relevance"
    search_key: str = ""
    skillFilter: str = ""  # Filter by skill_id instead of skill name
    sectorFilter: str = ""  # Filter by sector_id instead of sector name
    languageFilter: str = ""  # Filter by language_id instead of language name
    companySizeFilter: str = ""  # Filter by company_size_id
    countryFilter: str = ""  # Filter by country_id


@app.post("/recommend_jobs_pages/")
async def recommend_jobs(query: CandidateQuery):
    try:
        candidate_profile_dict = get_candidate_profile_by_id_matched(candidate_id=query.candidate_id)
        candidate_text = extract_resume_text(candidate_profile_dict) if isinstance(candidate_profile_dict, dict) else candidate_profile_dict
        candidate_doc = nlp(candidate_text)

        # Handle company_size - convert from string to int if not empty
        company_size_id = None
        if query.companySizeFilter and query.companySizeFilter.strip():
            try:
                company_size_id = int(query.companySizeFilter)
            except ValueError:
                raise HTTPException(status_code=400, detail="Invalid companySizeFilter format")
        else:
            # Check if candidate_profile_dict is a dictionary before using .get()
            if isinstance(candidate_profile_dict, dict):
                company_size_id = candidate_profile_dict.get("company_size_id")
        
        # If company_size_id is still None, we'll skip the company size filter

        query_embedding = embeddings.embed_query(candidate_text)
        query_embedding_np = np.array([query_embedding], dtype=np.float32)
        index = get_job_vectorstore()
        distances, indices = index.search(query_embedding_np, k=query.top_k)
        job_ids = [job_id_map.get(str(idx), "unknown") for idx in indices[0]]

        sql_query = """
        SELECT 
            a.id AS job_id,
            a.title AS title,
            a.text_description AS description,
            a.created_at AS created_at,
            GROUP_CONCAT(DISTINCT s.title) AS skill,
            GROUP_CONCAT(DISTINCT k.title) AS keyword,
            GROUP_CONCAT(DISTINCT l.title) AS language,
            GROUP_CONCAT(DISTINCT sec.sector_name) AS sector
        FROM 
            assignments AS a
        LEFT JOIN 
            users AS u ON a.user_id = u.id
        LEFT JOIN 
            assignment_skills AS ask ON a.id = ask.assignment_id
        LEFT JOIN 
            skills AS s ON ask.skill_id = s.id
        LEFT JOIN 
            assignment_keywords AS ak ON a.id = ak.assignment_id
        LEFT JOIN 
            keywords AS k ON ak.keyword_id = k.id
        LEFT JOIN 
            assignment_languages AS al ON a.id = al.assignment_id
        LEFT JOIN 
            languages AS l ON al.language_id = l.id
        LEFT JOIN 
            assignment_sectors AS asec ON a.id = asec.assignment_id
        LEFT JOIN 
            sectors AS sec ON asec.sector_id = sec.id
        LEFT JOIN 
            assignment_locations AS aloc ON a.id = aloc.assignment_id
        LEFT JOIN 
            user_addresses AS ua ON aloc.address_id = ua.id
        WHERE
            a.status = 'active' AND
            a.id IN ({}) 
            {} 
            {} 
            {} 
            {} 
            {}
        GROUP BY 
            a.id, a.title, a.text_description, a.created_at
        """

        conn = get_db_connection()
        try:
            valid_job_ids = [jid for jid in job_ids if jid != "unknown"]
            if not valid_job_ids:
                return {
                    "status": "No Active Jobs",
                    "message": "No matching jobs found",
                    "count": 0,
                    "total_count": 0,
                    "page": query.page,
                    "page_size": query.page_size,
                    "total_pages": 0,
                    "recommendations": []
                }

            placeholders = ", ".join(["%s"] * len(valid_job_ids))
            params = valid_job_ids.copy()

            # Handle company_size filter - only add if companySizeFilter exists
            company_size_filter_clause = ""
            if company_size_id is not None:
                company_size_filter_clause = "AND u.company_size_id = %s"
                params.append(company_size_id)

            # Handle skill filter by skill_id
            skill_filter_clause = ""
            if query.skillFilter and query.skillFilter.strip():
                try:
                    skill_id = int(query.skillFilter)
                    skill_filter_clause = "AND ask.skill_id = %s"
                    params.append(skill_id)
                except ValueError:
                    raise HTTPException(status_code=400, detail="Invalid skillFilter format")

            # Handle sector filter by sector_id
            sector_filter_clause = ""
            if query.sectorFilter and query.sectorFilter.strip():
                try:
                    sector_id = int(query.sectorFilter)
                    sector_filter_clause = "AND asec.sector_id = %s"
                    params.append(sector_id)
                except ValueError:
                    raise HTTPException(status_code=400, detail="Invalid sectorFilter format")

            # Handle language filter by language_id
            language_filter_clause = ""
            if query.languageFilter and query.languageFilter.strip():
                try:
                    language_id = int(query.languageFilter)
                    language_filter_clause = "AND al.language_id = %s"
                    params.append(language_id)
                except ValueError:
                    raise HTTPException(status_code=400, detail="Invalid languageFilter format")

            # Handle country filter by country_id
            country_filter_clause = ""
            if query.countryFilter and query.countryFilter.strip():
                try:
                    country_id_int = int(query.countryFilter)
                    country_filter_clause = "AND ua.country_id = %s"
                    params.append(country_id_int)
                except ValueError:
                    raise HTTPException(status_code=400, detail="Invalid countryFilter format")

            # Format the query with the ID placeholders and all optional filter clauses
            formatted_query = sql_query.format(
                placeholders, 
                company_size_filter_clause, 
                skill_filter_clause, 
                sector_filter_clause, 
                language_filter_clause, 
                country_filter_clause
            )
            
            jobs_df = pd.read_sql_query(formatted_query, conn, params=params)

            all_recommendations = []
            for i, job_id in enumerate(valid_job_ids):
                if job_id in jobs_df['job_id'].astype(str).values:
                    job_row = jobs_df[jobs_df['job_id'].astype(str) == job_id].iloc[0]
                    rich_job_text = " ".join([
                        str(job_row['title'] or ""),
                        str(job_row['description'] or ""),
                        str(job_row['skill'] or ""),
                        str(job_row['keyword'] or ""),
                        str(job_row['language'] or ""),
                        str(job_row['sector'] or "")
                    ])
                    dist_index = job_ids.index(job_id)
                    embedding_similarity_score = float(distances[0][dist_index])
                    created_at = job_row.get("created_at")

                    all_recommendations.append({
                        "job_id": job_id,
                        "title": job_row['title'],
                        "description": job_row['description'],
                        "skills": job_row['skill'],
                        "keywords": job_row['keyword'],
                        "languages": job_row['language'],
                        "sectors": job_row['sector'],
                        "created_at": created_at,
                        "embedding_similarity_score": embedding_similarity_score,
                        "similarity_score": embedding_similarity_score,
                        "match_strength": (
                            "Strong" if embedding_similarity_score >= 0.80 else
                            "Moderate" if embedding_similarity_score >= 0.70 else
                            "Weak"
                        ),
                    })

            # Sort recommendations
            if query.order_by == "newest":
                all_recommendations.sort(key=lambda x: x.get("created_at") or datetime.min, reverse=True)
            elif query.order_by == "oldest":
                all_recommendations.sort(key=lambda x: x.get("created_at") or datetime.min)
            else:
                all_recommendations.sort(key=lambda x: x["similarity_score"], reverse=True)

            # Apply threshold filter
            filtered_recommendations = [rec for rec in all_recommendations if rec["similarity_score"] >= query.threshold]

            # Apply search key filter (previously job_title_query)
            if query.search_key and query.search_key.strip():
                keyword = query.search_key.lower()
                filtered_recommendations = [rec for rec in filtered_recommendations if keyword in (rec["title"] or "").lower()]

            # Note: Skills, sectors, and languages are now filtered at the SQL level by their IDs
            # So we don't need the text-based filtering here anymore

            # Pagination
            total_count = len(filtered_recommendations)
            total_pages = (total_count + query.page_size - 1) // query.page_size
            start_index = (query.page - 1) * query.page_size
            end_index = start_index + query.page_size
            paginated_recommendations = filtered_recommendations[start_index:end_index]

            pagination_info = {
                "page": query.page,
                "page_size": query.page_size,
                "total_count": total_count,
                "total_pages": total_pages,
                "has_next": query.page < total_pages,
                "has_previous": query.page > 1
            }

            if total_count > 0:
                return {
                    "status": "Success",
                    "count": len(paginated_recommendations),
                    "recommendations": paginated_recommendations,
                    **pagination_info
                }
            else:
                return {
                    "status": "Below Threshold",
                    "message": "All active jobs are below the similarity threshold",
                    "count": 0,
                    "recommendations": [],
                    **pagination_info
                }

        finally:
            conn.close()

    except Exception as e:
        import traceback
        error_detail = traceback.format_exc()
        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}\n{error_detail}")







    

############################################################################################################################
#test
@app.post("/recommend_candidates_test/")
async def recommend_candidates(query: JobQuery):
    try:
        # Get job description and generate embedding
        job_description = get_job_description_by_id_match(job_id=query.job_id)
        # print("job_description:",job_description)
        job_doc = nlp(job_description)
        query_embedding = embeddings.embed_query(job_description)
        query_embedding_np = np.array([query_embedding], dtype=np.float32)

        # Get vector store and perform ANN search
        index = get_vectorstore()
        distances, indices = index.search(query_embedding_np, k=query.top_k)

        # Combine distances and indices
        results = list(zip(distances[0], indices[0].tolist()))

        recommendations = []
        for dist, idx in results:
            candidate_id = candidate_id_map.get(str(idx), "unknown")
            if candidate_id == "unknown":
                continue

            # Get candidate profile & enriched text
            candidate_profile = get_candidate_profile_by_id(candidate_id=candidate_id)
            # candidate_profile_dict = get_candidate_profile_by_id_matched(candidate_id=candidate_id)
            if isinstance(candidate_profile, dict):
                candidate_text = extract_resume_text(candidate_profile)
                # Proceed with business logic
                print("Valid input: candidate_profile is a dictionary.")
            else:
                candidate_text=candidate_profile

            candidate_doc = nlp(candidate_text)

            # Similarity metrics
            embedding_similarity_score = float(dist)
            rapid_fuzz_score = fuzz.token_set_ratio(job_description, candidate_text) / 100.0
            spacy_similarity_score = job_doc.similarity(candidate_doc)

            # Weighted hybrid score (tunable ratios)
            hybrid_score = (0.4 * spacy_similarity_score) + (0.6 * rapid_fuzz_score)

            # Strength classification
            match_strength = (
                "Strong" if hybrid_score >= 0.80 else
                "Moderate" if hybrid_score >= 0.70 else
                "Weak"
            )

            recommendations.append({
                "candidate_id": candidate_id,
                "embedding_similarity_score": embedding_similarity_score,
                "spacy_similarity_score": spacy_similarity_score,
                "rapid_fuzz_score": rapid_fuzz_score,
                "similarity_score": hybrid_score,
                "match_strength": match_strength
            })

        # Sort by hybrid score
        recommendations.sort(key=lambda x: x["similarity_score"], reverse=True)

        # Threshold filtering
        above_threshold = [rec for rec in recommendations if rec["similarity_score"] >= query.threshold]
        below_threshold = [rec for rec in recommendations if rec["similarity_score"] < query.threshold]

        if above_threshold:
            return {
                "status": "Success",
                "count": len(above_threshold),
                "recommendations": above_threshold
            }
        else:
            return {
                "status": "Below Threshold",
                "message": "All candidates are below the similarity threshold",
                "count": len(below_threshold),
                "recommendations": below_threshold
            }

    except Exception as e:
        import traceback
        error_detail = traceback.format_exc()
        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}\n{error_detail}")
    

    
class Skill(BaseModel):  # note: redefines (shadows) the earlier Skill model, here with a string id
    id: str
    title: str
class QuestionGenerationRequest(BaseModel):
    job_id: int
    candidate_id: int
    num_questions: int = 5  # Default still valid but can be overridden
    score: Optional[Union[int, str]] = None  # accepts a numeric score or its string representation
    skills: List[Skill]

from enum import Enum

class Options(Enum):
    aptitude = "aptitude"
    General = "General"


class GeneralQuestionGenerationRequest(BaseModel):
    job_id: int
    candidate_id: int
    num_questions: int = 5  # Default still valid but can be overridden
    score: Optional[Union[int, str]] = None  # accepts a numeric score or its string representation
    skills: List[Skill]
    options: str  # Changed from Options enum to string to accept the curl parameter
    topic: Optional[str] = None  # Added optional topic parameter
    level: str  # Added optional level parameter
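
# Illustrative request body for /generate-interview-questions-general (hypothetical values;
# Skill.id is a string in this request model):
#   {"job_id": 101, "candidate_id": 202, "num_questions": 5, "score": 75,
#    "skills": [{"id": "3", "title": "Python"}], "options": "General",
#    "topic": "REST APIs", "level": "intermediate"}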

@app.post("/generate-interview-questions/")
async def generate_questions_endpoint(request: QuestionGenerationRequest):
    try:
        # Get job description and candidate profile
        job_description = get_job_description_by_id_match(job_id=request.job_id)
        candidate_profile = get_candidate_profile_by_id_matched(candidate_id=request.candidate_id)
        
        # Extract text from candidate profile if it's a dictionary
        if isinstance(candidate_profile, dict):
            candidate_text = extract_resume_text(candidate_profile)
        else:
            candidate_text = candidate_profile
        print("question generation started")
        # Generate questions
        questions = await generate_interview_questions(
            job_description=job_description,
            candidate_profile=candidate_text,
            num_questions=request.num_questions,
            skills=request.skills,
            OPENAI_API_KEY=OPENAI_API_KEY
        )
        print("question generation completed")
        return {
            "status": "Success",
            "job_id": request.job_id,
            "candidate_id": request.candidate_id,
            "questions": questions
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")

from prompt_template import generate_interview_questions_general

@app.post("/generate-interview-questions-general")
async def generate_questions_general_endpoint(request: GeneralQuestionGenerationRequest):
    try:
        # Get job description and candidate profile
        job_description = get_job_description_by_id_match(job_id=request.job_id)
        candidate_profile = get_candidate_profile_by_id_matched(candidate_id=request.candidate_id)
        
        # Extract text from candidate profile if it's a dictionary
        if isinstance(candidate_profile, dict):
            candidate_text = extract_resume_text(candidate_profile)
        else:
            candidate_text = candidate_profile
        print("question generation started")
        # Generate questions
        questions = await generate_interview_questions_general(
            job_description=job_description,
            candidate_profile=candidate_text,
            num_questions=request.num_questions,
            skills=request.skills,
            OPENAI_API_KEY=OPENAI_API_KEY,
            options=request.options,
            topic=request.topic,
            level=request.level
        )
        print("question generation completed",questions)
        return {
            "status": "Success",
            "job_id": request.job_id,
            "candidate_id": request.candidate_id,
            "questions": questions
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")

def clean_resume_text(text):
    # Normalize Unicode characters
    text = unicodedata.normalize('NFKC', text)

    # Replace newlines and carriage returns with spaces
    text = re.sub(r'[\r\n]+', ' ', text)

    # Replace multiple spaces with a single space
    text = re.sub(r'\s{2,}', ' ', text)

    # Replace various separators with standardized punctuation
    text = re.sub(r'[\|%]+', ' | ', text)  # pipes and % become separators
    text = re.sub(r',(?=\S)', ', ', text)  # ensure space after commas
    text = re.sub(r'\s*,\s*', ', ', text)  # clean up comma spacing

    # Remove redundant punctuation and extra spaces again
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'\s*\|\s*', ' | ', text)  # standardize pipe spacing

    # Deduplicate comma-separated and pipe-separated sections
    def deduplicate_sections(text):
        def dedup(s, sep):
            seen = set()
            items = []
            for item in [i.strip() for i in s.split(sep)]:
                key = item.lower()
                if key and key not in seen:
                    seen.add(key)
                    items.append(item)
            return f" {sep} ".join(items)

        # Deduplicate within comma-separated and pipe-separated blocks
        text = ' | '.join([dedup(block, ',') for block in text.split('|')])
        return text

    text = deduplicate_sections(text)

    # Final cleanup: strip leading/trailing spaces and fix spacing
    return text.strip()
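
# Illustrative behaviour of clean_resume_text (hypothetical input): duplicates inside a
# comma-separated block are dropped case-insensitively and pipe spacing is standardized, e.g.
#   clean_resume_text("Python, SQL, python | Java")  # -> "Python , SQL | Java"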

def clean_resume_text_1(text):
    # Normalize Unicode characters
    text = unicodedata.normalize('NFKC', text)

    # Replace newlines and carriage returns with spaces
    text = re.sub(r'[\r\n]+', ' ', text)

    # Replace multiple spaces with a single space
    text = re.sub(r'\s{2,}', ' ', text)

    # Remove commas, pipes, slashes, and percent signs
    text = re.sub(r'[|,/%]+', ' ', text)

    # Remove extra spaces again
    text = re.sub(r'\s{2,}', ' ', text)

    # Final cleanup: strip leading/trailing spaces
    return text.strip()
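
# Illustrative behaviour of clean_resume_text_1 (hypothetical input): separators are stripped
# rather than standardized, e.g.
#   clean_resume_text_1("Python, SQL | Django/Flask\n5% growth")  # -> "Python SQL Django Flask 5 growth"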


class UpdateCandidateEmbeddingRequest(BaseModel):
    candidate_id: int


# Milvus-related Pydantic models
class MilvusDatabaseRequest(BaseModel):
    database_name: Optional[str] = None
    description: Optional[str] = None

class MilvusCollectionRequest(BaseModel):
    database_name: str
    collection_name: str
    description: Optional[str] = None

class FieldDefinition(BaseModel):
    name: str
    data_type: str  # "INT64", "FLOAT_VECTOR", "VARCHAR", etc.
    is_primary: bool = False
    auto_id: bool = False
    max_length: Optional[int] = None  # For VARCHAR fields
    dim: Optional[int] = None  # For vector fields

class MilvusSchemaRequest(BaseModel):
    database_name: str
    collection_name: str
    fields: List[FieldDefinition]
    description: Optional[str] = None
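
# Illustrative "fields" list for a MilvusSchemaRequest body (hypothetical; mirrors the candidate
# schema created by /milvus/create_candidate_schema/ below):
#   [{"name": "candidate_id", "data_type": "INT64", "is_primary": true},
#    {"name": "headline", "data_type": "VARCHAR", "max_length": 3000},
#    {"name": "embedding", "data_type": "FLOAT_VECTOR", "dim": 1536}]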

class MilvusDropCollectionRequest(BaseModel):
    database_name: str
    collection_name: str

class MilvusSyncRequest(BaseModel):
    database_name: str
    candidate_collection_name: Optional[str] = "candidates"
    job_collection_name: Optional[str] = "jobs"
    sync_candidates: bool = True
    sync_jobs: bool = True
    batch_size: int = 100
    force_recreate: bool = False
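
# Illustrative request body for a Milvus sync (hypothetical database name; the remaining values
# are the model defaults):
#   {"database_name": "hirenest_vectors", "candidate_collection_name": "candidates",
#    "job_collection_name": "jobs", "sync_candidates": true, "sync_jobs": true,
#    "batch_size": 100, "force_recreate": false}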


# @app.post("/update_candidate_embedding/")
# async def update_candidate_embedding(request: UpdateCandidateEmbeddingRequest):
#     """
#     Updates the embedding for a specific candidate in the FAISS index using the latest data.
#     Uses IndexIDMap for better ID management.
#     """
#     global current_index_path, last_update_time
#     try:
#         candidate_id = str(request.candidate_id)
#         # Load latest files and mapping
#         get_latest_files()
#         load_candidate_mapping()
#         if not current_index_path or not current_mapping_path:
#             raise HTTPException(status_code=404, detail="Vector store or mapping not found.")
#         if candidate_id not in candidate_id_map.values():
#             raise HTTPException(status_code=404, detail=f"Candidate ID {candidate_id} not found in mapping.")
        
#         # Find the index position for this candidate
#         index_pos = None
#         for idx, cid in candidate_id_map.items():
#             if cid == candidate_id:
#                 index_pos = int(idx)
#                 break
#         if index_pos is None:
#             raise HTTPException(status_code=404, detail=f"Candidate ID {candidate_id} not found in mapping.")
        
#         # Get latest candidate data
#         conn = get_db_connection()
#         try:
#             query = """
#             SELECT 
#                 GROUP_CONCAT(DISTINCT sk.title) AS skills,
#                 u.headline,
#                 GROUP_CONCAT(DISTINCT l.title) AS languages,
#                 GROUP_CONCAT(DISTINCT s.sector_name) AS sectors,
#                 u.about,
#                 GROUP_CONCAT(DISTINCT k.title) AS keywords,
#                 GROUP_CONCAT(DISTINCT CONCAT_WS('||', up.title, up.description, up.technologies) SEPARATOR '%%') AS projects,
#                 GROUP_CONCAT(DISTINCT CONCAT_WS('||', uc.name, uc.authority) SEPARATOR '%%') AS certifications
#             FROM 
#                 candidate_details cd
#             LEFT JOIN users u ON cd.candidate_id = u.id
#             LEFT JOIN user_sectors us ON cd.candidate_id = us.user_id
#             LEFT JOIN sectors s ON us.sector_id = s.id
#             LEFT JOIN user_keywords uk ON cd.candidate_id = uk.user_id
#             LEFT JOIN keywords k ON uk.keyword_id = k.id
#             LEFT JOIN user_skills usk ON cd.candidate_id = usk.user_id
#             LEFT JOIN skills sk ON usk.skill_id = sk.id
#             LEFT JOIN user_languages ul ON cd.candidate_id = ul.user_id
#             LEFT JOIN languages l ON ul.language_id = l.id
#             LEFT JOIN user_projects up ON cd.candidate_id = up.user_id
#             LEFT JOIN user_certifications uc ON cd.candidate_id = uc.user_id
#             WHERE 
#                 cd.candidate_id = %s
#             GROUP BY 
#                 u.headline, u.about
#             """
#             df = pd.read_sql_query(query, conn, params=[candidate_id])
#             if df.empty:
#                 raise HTTPException(status_code=404, detail=f"Candidate ID {candidate_id} not found in DB.")
#         finally:
#             conn.close()
        
#         # Process candidate data
#         row = df.iloc[0]
#         candidate_text_parts = []
#         for col in df.columns:
#             value = row[col]
#             if pd.notnull(value):
#                 candidate_text_parts.append(str(value))
        
#         candidate_text = " ".join(candidate_text_parts)
#         print("candidate_text:", candidate_text)
#         cleaned_candidate_text = clean_resume_text(candidate_text)
#         # cleaned_candidate_text = "Security Officer  Crowd control and event security Patrolling Security Training"
        
#         # Generate new embedding
#         new_embedding = embeddings.embed_query(cleaned_candidate_text)
#         new_embedding_np = np.array([new_embedding], dtype=np.float32)
        
#         # Load existing index
#         existing_index = faiss.read_index(current_index_path)
#         print("index_pos:", index_pos)
#         print("existing_index.ntotal:", existing_index.ntotal)
        
#         if index_pos >= existing_index.ntotal:
#             raise HTTPException(status_code=400, detail="Index position out of range.")
        
#         # Check if the existing index is already an IndexIDMap
#         if isinstance(existing_index, faiss.IndexIDMap):
#             # Use existing IndexIDMap
#             index = existing_index
#         else:
#             # Create IndexIDMap wrapper for better ID management
#             embedding_dimension = new_embedding_np.shape[1]
#             index_base = faiss.IndexFlatIP(embedding_dimension)
#             index = faiss.IndexIDMap(index_base)
            
#             # Reconstruct all embeddings and their IDs
#             all_embeddings = existing_index.reconstruct_n(0, existing_index.ntotal)
            
#             # Create ID array (assuming sequential IDs from 0 to ntotal-1)
#             all_ids = np.arange(existing_index.ntotal, dtype=np.int64)
            
#             # Add all existing embeddings with their IDs
#             index.add_with_ids(all_embeddings, all_ids)
        
#         # When updating - use the simplified approach
#         target_id = index_pos
        
#         # Remove old embedding
#         index.remove_ids(np.array([target_id], dtype=np.int64))
        
#         # Add new embedding with same ID
#         index.add_with_ids(
#             new_embedding_np.reshape(1, -1), 
#             np.array([target_id], dtype=np.int64)
#         )
        
#         # Save new index with timestamp
#         timestr = time.strftime("%Y%m%d-%H%M%S")
#         new_index_path = os.path.join(output_dir, f"faiss_index_candidates_{timestr}")
#         faiss.write_index(index, new_index_path)
        
#         # Update global pointer and timestamp
#         current_index_path = new_index_path
#         last_update_time = time.time()
        
#         return {
#             "status": "Success", 
#             "message": f"Embedding for candidate {candidate_id} updated at index position {index_pos}.", 
#             "index_path": os.path.basename(new_index_path)
#         }
        
#     except Exception as e:
#         raise HTTPException(status_code=500, detail=f"Error updating candidate embedding: {str(e)}\n{error_detail}")



@app.post("/update_candidate_embedding/")
async def update_candidate_embedding(request: UpdateCandidateEmbeddingRequest):
    """
    Updates the embedding for a specific candidate in Milvus using the latest data.
    Skills are weighted more heavily in embedding generation.
    """
    try:
        candidate_id = str(request.candidate_id)

        # Connect to Milvus
        try:
            connections.connect(
                alias="default",
                host=MILVUS_HOST,
                port=MILVUS_PORT
            )
            db.using_database(MILVUS_DB_NAME)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Failed to connect to Milvus: {str(e)}")

        # Check if collection exists
        collection_name = CANDIDATE_RECOMMANDATION_COLLECTION_NAME
        if not utility.has_collection(collection_name):
            raise HTTPException(status_code=404, detail=f"Collection '{collection_name}' not found. Please sync data first.")

        # Get collection
        collection = Collection(collection_name)
        
        # Load collection for querying
        try:
            collection.load()
        except Exception as load_error:
            print(f"[WARN] Could not load collection: {load_error}")

        # Check if candidate exists in Milvus
        try:
            existing_records = collection.query(
                expr=f"candidate_id == {candidate_id}",
                output_fields=["candidate_id"],
                limit=1
            )
            if not existing_records:
                raise HTTPException(status_code=404, detail=f"Candidate ID {candidate_id} not found in Milvus collection.")
        except Exception as query_error:
            raise HTTPException(status_code=500, detail=f"Failed to check candidate existence: {query_error}")

        # Get latest candidate data from database
        conn = get_db_connection()
        try:
            query = """
            SELECT 
                GROUP_CONCAT(DISTINCT sk.title) AS skills,
                u.headline,
                GROUP_CONCAT(DISTINCT l.title) AS languages,
                GROUP_CONCAT(DISTINCT s.sector_name) AS sectors,
                u.about,
                GROUP_CONCAT(DISTINCT k.title) AS keywords,
                GROUP_CONCAT(DISTINCT CONCAT_WS('||', up.title, up.description, up.technologies) SEPARATOR '%%') AS projects,
                GROUP_CONCAT(DISTINCT CONCAT_WS('||', uc.name, uc.authority) SEPARATOR '%%') AS certifications
            FROM 
                candidate_details cd
            LEFT JOIN users u ON cd.candidate_id = u.id
            LEFT JOIN user_sectors us ON cd.candidate_id = us.user_id
            LEFT JOIN sectors s ON us.sector_id = s.id
            LEFT JOIN user_keywords uk ON cd.candidate_id = uk.user_id
            LEFT JOIN keywords k ON uk.keyword_id = k.id
            LEFT JOIN user_skills usk ON cd.candidate_id = usk.user_id
            LEFT JOIN skills sk ON usk.skill_id = sk.id
            LEFT JOIN user_languages ul ON cd.candidate_id = ul.user_id
            LEFT JOIN languages l ON ul.language_id = l.id
            LEFT JOIN user_projects up ON cd.candidate_id = up.user_id
            LEFT JOIN user_certifications uc ON cd.candidate_id = uc.user_id
            WHERE 
                cd.candidate_id = %s
            GROUP BY 
                u.headline, u.about
            """
            df = pd.read_sql_query(query, conn, params=[candidate_id])
            if df.empty:
                raise HTTPException(status_code=404, detail=f"Candidate ID {candidate_id} not found in database.")
        finally:
            conn.close()

        # Process candidate data with emphasis on skills and headline (same as Faiss version)
        row = df.iloc[0]
        candidate_text_parts = []

        # Emphasize skills by repeating them
        if pd.notnull(row["skills"]):
            repeated_skills = (" " + row["skills"]) * 5
            candidate_text_parts.append(repeated_skills)

        # Emphasize headline (job title)
        if pd.notnull(row["headline"]):
            repeated_headline = (" " + row["headline"]) * 3
            candidate_text_parts.append(repeated_headline)

        # Add remaining fields
        for col in df.columns:
            if col not in ["skills", "headline"]:
                value = row[col]
                if pd.notnull(value):
                    candidate_text_parts.append(str(value))

        candidate_text = " ".join(candidate_text_parts)
        print(f"[INFO] Candidate text: {candidate_text}")
        cleaned_candidate_text = clean_resume_text_1(candidate_text)
        print(f"[INFO] Cleaned candidate text: {cleaned_candidate_text}")

        # Generate new embedding
        new_embedding = embeddings.embed_query(cleaned_candidate_text)

        # Prepare updated candidate data for Milvus
        updated_candidate_data = {
            "candidate_id": int(candidate_id),
            "headline": str(row.get("headline", "") or ""),
            "about": str(row.get("about", "") or ""),
            "skills": str(row.get("skills", "") or ""),
            "sectors": str(row.get("sectors", "") or ""),
            "keywords": str(row.get("keywords", "") or ""),
            "languages": str(row.get("languages", "") or ""),
            "projects": str(row.get("projects", "") or ""),
            "certifications": str(row.get("certifications", "") or ""),
            "embedding": new_embedding
        }

        # Delete the old record and insert the new one (Milvus doesn't support direct updates)
        try:
            # Delete existing record
            delete_result = collection.delete(expr=f"candidate_id == {candidate_id}")
            print(f"[INFO] Deleted old candidate record: {delete_result}")
            
            # Insert updated record
            insert_result = collection.insert([updated_candidate_data])
            collection.flush()
            print(f"[INFO] Inserted updated candidate record: {insert_result}")
            
            # Reload collection to make changes available for search
            collection.load()
            

            remove_candidate_entries(candidate_id)
        except Exception as update_error:
            raise HTTPException(status_code=500, detail=f"Failed to update candidate in Milvus: {str(update_error)}")

        return {
            "status": "Success",
            "message": f"Embedding for candidate {candidate_id} updated successfully in Milvus.",
            "candidate_id": candidate_id
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error updating candidate embedding: {str(e)}")



def remove_candidate_entries(candidate_id: int):
    """
    Remove all entries in the recommendations table related to the given candidate_id.
    """
    try:
        # sqlite3 is imported locally because it is only used by this helper;
        # DB_FILE is expected to be defined elsewhere in the configuration (it is not set here).
        import sqlite3
        conn = sqlite3.connect(DB_FILE)
        cursor = conn.cursor()

        cursor.execute(
            "DELETE FROM recommendations WHERE candidate_id = ?",
            (candidate_id,)
        )
        conn.commit()
        deleted_rows = cursor.rowcount
        conn.close()

        return {"status": "success", "deleted_rows": deleted_rows}

    except Exception as e:
        return {"status": "error", "message": str(e)}




# Milvus Configuration
MILVUS_HOST = config('MILVUS_HOST', default="localhost")
MILVUS_PORT = config('MILVUS_PORT', default="19530")

def get_milvus_connection():
    """Establish connection to Milvus server"""
    try:
        connections.connect(
            alias="default",
            host=MILVUS_HOST,
            port=MILVUS_PORT
        )
        return True
    except Exception as e:
        print(f"Failed to connect to Milvus: {e}")
        return False


@app.post("/milvus/create_database/")
async def create_milvus_database(request: MilvusDatabaseRequest):
    """
    Create a new Milvus database.
    
    Args:
        request: MilvusDatabaseRequest containing database_name and optional description
    
    Returns:
        dict: Success message with database details
    """
    try:
        # Establish connection
        if not get_milvus_connection():
            raise HTTPException(status_code=500, detail="Failed to connect to Milvus server")
        
        # Check if database already exists
        if not request.database_name:
            request.database_name = MILVUS_DB_NAME
        existing_databases = db.list_database()
        if request.database_name in existing_databases:
            return {
                "status": "warning",
                "message": f"Database '{request.database_name}' already exists",
                "database_name": request.database_name,
                "existing_databases": existing_databases
            }
        
        # Create database
        db.create_database(db_name=request.database_name)
        
        # Verify creation
        updated_databases = db.list_database()
        
        return {
            "status": "success",
            "message": f"Database '{request.database_name}' created successfully",
            "database_name": request.database_name,
            "description": request.description,
            "all_databases": updated_databases
        }
        
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error creating Milvus database: {str(e)}")



@app.post("/milvus/create_candidate_schema/")
async def create_candidate_schema(request: MilvusCollectionRequest):
    """
    Create a specialized schema for candidate data based on the database structure.
    
    Schema includes:
    - candidate_id (INT64, primary key)
    - headline (VARCHAR)
    - about (VARCHAR) 
    - sectors (VARCHAR)
    - keywords (VARCHAR)
    - skills (VARCHAR)
    - languages (VARCHAR)
    - projects (VARCHAR)
    - certifications (VARCHAR)
    - embedding (FLOAT_VECTOR[1536])
    
    Args:
        request: MilvusCollectionRequest containing database_name, collection_name, and optional description
    
    Returns:
        dict: Success message with schema details
    """
    try:
        # Establish connection
        if not get_milvus_connection():
            raise HTTPException(status_code=500, detail="Failed to connect to Milvus server")
        
        # Switch to the specified database
        db.using_database(request.database_name)
        
        # Check if collection exists and drop it if it does (to recreate with new schema)
        if utility.has_collection(request.collection_name):
            collection = Collection(request.collection_name)
            collection.drop()
        
        # Define candidate-specific schema fields based on the DataFrame columns
        candidate_fields = [
            FieldSchema(
                name="candidate_id", 
                dtype=DataType.INT64, 
                is_primary=True,
                auto_id=False,
                description="Unique candidate identifier from database"
            ),
            FieldSchema(
                name="headline", 
                dtype=DataType.VARCHAR, 
                max_length=3000,
                description="Candidate job title/headline from users.headline"
            ),
            FieldSchema(
                name="about", 
                dtype=DataType.VARCHAR, 
                max_length=16000,
                description="Candidate about/summary section from users.about"
            ),
            FieldSchema(
                name="sectors", 
                dtype=DataType.VARCHAR, 
                max_length=5000,
                description="Comma-separated list of sectors from GROUP_CONCAT(sectors.sector_name)"
            ),
            FieldSchema(
                name="keywords", 
                dtype=DataType.VARCHAR, 
                max_length=5000,
                description="Comma-separated list of keywords from GROUP_CONCAT(keywords.title)"
            ),
            FieldSchema(
                name="skills", 
                dtype=DataType.VARCHAR, 
                max_length=16000,
                description="Comma-separated list of skills from GROUP_CONCAT(skills.title)"
            ),
            FieldSchema(
                name="languages", 
                dtype=DataType.VARCHAR, 
                max_length=5000,
                description="Comma-separated list of languages from GROUP_CONCAT(languages.title)"
            ),
            FieldSchema(
                name="projects", 
                dtype=DataType.VARCHAR, 
                max_length=16000,
                description="Projects data concatenated with separators from user_projects"
            ),
            FieldSchema(
                name="certifications", 
                dtype=DataType.VARCHAR, 
                max_length=16000,
                description="Certifications data concatenated with separators from user_certifications"
            ),
            FieldSchema(
                name="embedding", 
                dtype=DataType.FLOAT_VECTOR, 
                dim=1536,
                description="OpenAI text-embedding-ada-002 vector representation"
            )
        ]
        
        # Create schema
        schema = CollectionSchema(
            fields=candidate_fields,
            description=request.description or f"Candidate data schema for recommendation system with structured fields and embeddings"
        )
        
        # Create collection with candidate schema
        collection = Collection(
            name=request.collection_name,
            schema=schema,
            using='default'
        )
        
        # Create IVF_FLAT index for the embedding field
        try:
            print(f"[INFO] Creating IVF_FLAT index for collection '{request.collection_name}'...")
            index_params = {
                "metric_type": "IP",  # Inner Product for similarity
                "index_type": "IVF_FLAT",
                "params": {
                    "nlist": 1024  # Number of cluster units, good for most datasets
                }
            }
            collection.create_index(field_name="embedding", index_params=index_params)
            print(f"[INFO] IVF_FLAT index created successfully for collection '{request.collection_name}'")
        except Exception as index_error:
            print(f"[WARN] Index creation warning: {index_error}")
            # Continue even if index creation fails
        
        # Get schema details for response
        schema_details = []
        for field in candidate_fields:
            field_info = {
                "name": field.name,
                "data_type": field.dtype.name,
                "is_primary": field.is_primary,
                "auto_id": field.auto_id,
                "description": field.description or ""
            }
            
            if hasattr(field, 'dim'):
                field_info["dimension"] = field.dim
            if hasattr(field, 'max_length'):
                field_info["max_length"] = field.max_length
                
            schema_details.append(field_info)
        
        return {
            "status": "success",
            "message": f"Candidate schema created successfully for collection '{request.collection_name}' in database '{request.database_name}'",
            "database_name": request.database_name,
            "collection_name": request.collection_name,
            "schema_type": "candidate_recommendation",
            "schema_details": schema_details,
            "total_fields": len(candidate_fields),
            "field_mapping": {
                "candidate_id": "Primary key from candidate_details.candidate_id",
                "headline": "From users.headline", 
                "about": "From users.about",
                "sectors": "GROUP_CONCAT(sectors.sector_name)",
                "keywords": "GROUP_CONCAT(keywords.title)",
                "skills": "GROUP_CONCAT(skills.title)",
                "languages": "GROUP_CONCAT(languages.title)",
                "projects": "GROUP_CONCAT(user_projects data)",
                "certifications": "GROUP_CONCAT(user_certifications data)",
                "embedding": "Generated from OpenAI embeddings API"
            },
            "note": "Schema matches the exact DataFrame structure: ['headline', 'about', 'sectors', 'keywords', 'skills', 'languages', 'projects', 'certifications'] + candidate_id + embedding"
        }
        
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error creating candidate schema: {str(e)}")

@app.post("/milvus/create_job_schema/")
async def create_job_schema(request: MilvusCollectionRequest):
    """
    Create a specialized schema for job data based on the database structure.
    
    Schema includes:
    - job_id (INT64, primary key)
    - title (VARCHAR)
    - description (VARCHAR)
    - skills (VARCHAR)
    - keywords (VARCHAR)
    - languages (VARCHAR)
    - sectors (VARCHAR)
    - embedding (FLOAT_VECTOR[1536])
    
    Args:
        request: MilvusCollectionRequest containing database_name, collection_name, and optional description
    
    Returns:
        dict: Success message with schema details
    """
    try:
        # Establish connection
        if not get_milvus_connection():
            raise HTTPException(status_code=500, detail="Failed to connect to Milvus server")
        
        # Switch to the specified database
        db.using_database(request.database_name)
        
        # Check if collection exists and drop it if it does (to recreate with new schema)
        if utility.has_collection(request.collection_name):
            collection = Collection(request.collection_name)
            collection.drop()
        
        # Define job-specific schema fields
        job_fields = [
            FieldSchema(
                name="job_id", 
                dtype=DataType.INT64, 
                is_primary=True,
                auto_id=False,
                description="Unique job identifier from assignments table"
            ),
            FieldSchema(
                name="title", 
                dtype=DataType.VARCHAR, 
                max_length=5000,
                description="Job title from assignments.title"
            ),
            FieldSchema(
                name="description", 
                dtype=DataType.VARCHAR, 
                max_length=16000,
                description="Job description from assignments.text_description"
            ),
            FieldSchema(
                name="skills", 
                dtype=DataType.VARCHAR, 
                max_length=16000,
                description="Comma-separated list of required skills from GROUP_CONCAT(skills.title)"
            ),
            FieldSchema(
                name="keywords", 
                dtype=DataType.VARCHAR, 
                max_length=16000,
                description="Comma-separated list of job keywords from GROUP_CONCAT(keywords.title)"
            ),
            FieldSchema(
                name="languages", 
                dtype=DataType.VARCHAR, 
                max_length=5000,
                description="Comma-separated list of required languages from GROUP_CONCAT(languages.title)"
            ),
            FieldSchema(
                name="sectors", 
                dtype=DataType.VARCHAR, 
                max_length=16000,
                description="Comma-separated list of relevant sectors from GROUP_CONCAT(sectors.sector_name)"
            ),
            FieldSchema(
                name="embedding", 
                dtype=DataType.FLOAT_VECTOR, 
                dim=1536,
                description="OpenAI text-embedding-ada-002 vector representation"
            )
        ]
        
        # Create schema
        schema = CollectionSchema(
            fields=job_fields,
            description=request.description or f"Job data schema for recommendation system with structured fields and embeddings"
        )
        
        # Create collection with job schema
        collection = Collection(
            name=request.collection_name,
            schema=schema,
            using='default'
        )
        
        # Create IVF_FLAT index for the embedding field
        try:
            print(f"[INFO] Creating IVF_FLAT index for job collection '{request.collection_name}'...")
            index_params = {
                "metric_type": "IP",  # Inner Product for similarity
                "index_type": "IVF_FLAT",
                "params": {
                    "nlist": 1024  # Number of cluster units, good for most datasets
                }
            }
            collection.create_index(field_name="embedding", index_params=index_params)
            print(f"[INFO] IVF_FLAT index created successfully for job collection '{request.collection_name}'")
        except Exception as index_error:
            print(f"[WARN] Index creation warning: {index_error}")
            # Continue even if index creation fails
        
        # Get schema details for response
        schema_details = []
        for field in job_fields:
            field_info = {
                "name": field.name,
                "data_type": field.dtype.name,
                "is_primary": field.is_primary,
                "auto_id": field.auto_id,
                "description": field.description or ""
            }
            
            if hasattr(field, 'dim'):
                field_info["dimension"] = field.dim
            if hasattr(field, 'max_length'):
                field_info["max_length"] = field.max_length
                
            schema_details.append(field_info)
        
        return {
            "status": "success",
            "message": f"Job schema created successfully for collection '{request.collection_name}' in database '{request.database_name}'",
            "database_name": request.database_name,
            "collection_name": request.collection_name,
            "schema_type": "job_recommendation",
            "schema_details": schema_details,
            "total_fields": len(job_fields),
            "field_mapping": {
                "job_id": "Primary key from assignments.id",
                "title": "From assignments.title",
                "description": "From assignments.text_description", 
                "skills": "GROUP_CONCAT(skills.title)",
                "keywords": "GROUP_CONCAT(keywords.title)",
                "languages": "GROUP_CONCAT(languages.title)",
                "sectors": "GROUP_CONCAT(sectors.sector_name)",
                "embedding": "Generated from OpenAI embeddings API"
            },
            "note": "Schema designed for job recommendation system matching the assignments table structure"
        }
        
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error creating job schema: {str(e)}")

@app.get("/milvus/list_databases/")
async def list_milvus_databases():
    """
    List all available Milvus databases.
    
    Returns:
        dict: List of all databases
    """
    try:
        if not get_milvus_connection():
            raise HTTPException(status_code=500, detail="Failed to connect to Milvus server")
        
        databases = db.list_database()
        
        return {
            "status": "success",
            "databases": databases,
            "count": len(databases)
        }
        
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error listing Milvus databases: {str(e)}")

@app.get("/milvus/list_collections/{database_name}")
async def list_milvus_collections(database_name: str):
    """
    List all collections in a specific Milvus database.
    
    Args:
        database_name: Name of the database
    
    Returns:
        dict: List of collections in the database
    """
    try:
        if not get_milvus_connection():
            raise HTTPException(status_code=500, detail="Failed to connect to Milvus server")
        
        # Switch to the specified database
        db.using_database(database_name)
        
        collections = utility.list_collections()
        
        return {
            "status": "success",
            "database_name": database_name,
            "collections": collections,
            "count": len(collections)
        }
        
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error listing collections in database '{database_name}': {str(e)}")

@app.get("/milvus/collection_info/{database_name}/{collection_name}")
async def get_collection_info(database_name: str, collection_name: str):
    """
    Get detailed information about a specific collection.
    
    Args:
        database_name: Name of the database
        collection_name: Name of the collection
    
    Returns:
        dict: Detailed collection information including schema
    """
    try:
        if not get_milvus_connection():
            raise HTTPException(status_code=500, detail="Failed to connect to Milvus server")
        
        # Switch to the specified database
        db.using_database(database_name)
        
        if not utility.has_collection(collection_name):
            raise HTTPException(status_code=404, detail=f"Collection '{collection_name}' not found in database '{database_name}'")
        
        collection = Collection(collection_name)
        schema = collection.schema
        
        # Extract field information
        fields_info = []
        for field in schema.fields:
            field_info = {
                "name": field.name,
                "data_type": field.dtype.name,
                "is_primary": field.is_primary,
                "auto_id": field.auto_id,
                "description": field.description or ""
            }
            
            if hasattr(field, 'dim'):
                field_info["dimension"] = field.dim
            if hasattr(field, 'max_length'):
                field_info["max_length"] = field.max_length
                
            fields_info.append(field_info)
        
        return {
            "status": "success",
            "database_name": database_name,
            "collection_name": collection_name,
            "description": schema.description,
            "fields": fields_info,
            "total_fields": len(fields_info)
        }
        
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error getting collection info: {str(e)}")

@app.delete("/milvus/drop_collection/")
async def drop_milvus_collection(request: MilvusDropCollectionRequest):
    """
    Drop (delete) a Milvus collection.
    
    Args:
        request: MilvusDropCollectionRequest containing database_name and collection_name
    
    Returns:
        dict: Success message confirming collection deletion
    """
    try:
        # Establish connection
        if not get_milvus_connection():
            raise HTTPException(status_code=500, detail="Failed to connect to Milvus server")
        
        # Switch to the specified database
        db.using_database(request.database_name)
        
        # Check if collection exists
        if not utility.has_collection(request.collection_name):
            raise HTTPException(
                status_code=404, 
                detail=f"Collection '{request.collection_name}' not found in database '{request.database_name}'"
            )
        
        # Get collection info before dropping (for response)
        collection = Collection(request.collection_name)
        schema = collection.schema
        field_count = len(schema.fields)
        
        # Drop the collection
        collection.drop()
        
        # Verify collection is dropped
        if utility.has_collection(request.collection_name):
            raise HTTPException(
                status_code=500, 
                detail=f"Failed to drop collection '{request.collection_name}'"
            )
        
        return {
            "status": "success",
            "message": f"Collection '{request.collection_name}' dropped successfully from database '{request.database_name}'",
            "database_name": request.database_name,
            "collection_name": request.collection_name,
            "dropped_fields_count": field_count,
            "warning": "This action is irreversible. All data in the collection has been permanently deleted."
        }
        
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error dropping collection: {str(e)}")

@app.delete("/milvus/drop_collection/{database_name}/{collection_name}")
async def drop_milvus_collection_path(database_name: str, collection_name: str):
    """
    Drop (delete) a Milvus collection using path parameters.
    
    Args:
        database_name: Name of the database
        collection_name: Name of the collection to drop
    
    Returns:
        dict: Success message confirming collection deletion
    """
    try:
        # Create request object and call the main drop function
        request = MilvusDropCollectionRequest(
            database_name=database_name,
            collection_name=collection_name
        )
        return await drop_milvus_collection(request)
        
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error dropping collection: {str(e)}")

@app.post("/milvus/load_collection/")
async def load_milvus_collection(request: MilvusCollectionRequest):
    """
    Load a Milvus collection into memory and create index if needed.
    """
    try:
        # Establish connection
        if not get_milvus_connection():
            raise HTTPException(status_code=500, detail="Failed to connect to Milvus server")
        
        # Switch to the specified database
        db.using_database(request.database_name)
        
        # Check if collection exists
        if not utility.has_collection(request.collection_name):
            raise HTTPException(status_code=404, detail=f"Collection '{request.collection_name}' not found")
        
        # Get the collection
        collection = Collection(request.collection_name)
        
        # Create index if it doesn't exist
        try:
            if not collection.has_index():
                print(f"[INFO] Creating index for collection '{request.collection_name}'...")
                index_params = {
                    "metric_type": "IP",
                    "index_type": "FLAT",
                    "params": {}
                }
                collection.create_index(field_name="embedding", index_params=index_params)
                print("[INFO] Index created successfully")
            else:
                print("[INFO] Index already exists")
        except Exception as e:
            print(f"[WARN] Index handling: {e}")
        
        # Load the collection
        collection.load()
        
        # Give the load a moment to complete (time is already imported at module level)
        time.sleep(2)
        
        return {
            "status": "success",
            "message": f"Collection '{request.collection_name}' loaded successfully",
            "database_name": request.database_name,
            "collection_name": request.collection_name,
            "num_entities": collection.num_entities
        }
        
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error loading collection: {str(e)}")

@app.get("/milvus/test_search/{database_name}/{collection_name}")
async def test_milvus_search(database_name: str, collection_name: str):
    """
    Test endpoint to debug Milvus search issues.
    """
    try:
        # Connect to Milvus
        connections.connect(
            alias="default",
            host=MILVUS_HOST,
            port=MILVUS_PORT
        )
        
        # Switch to database
        db.using_database(database_name)
        
        # Get collection
        collection = Collection(collection_name)
        
        # Check collection status
        status_info = {
            "collection_name": collection_name,
            "num_entities": collection.num_entities,
            "schema": str(collection.schema),
        }
        
        # Try to load
        try:
            collection.load()
            status_info["load_status"] = "success"
        except Exception as e:
            status_info["load_status"] = f"failed: {str(e)}"
        
        # Try a simple search
        try:
            # Create a dummy embedding (1536 dimensions)
            dummy_embedding = [0.1] * 1536
            
            search_results = collection.search(
                data=[dummy_embedding],
                anns_field="embedding",
                param={"metric_type": "IP", "params": {}},
                limit=1,
                output_fields=["candidate_id"]
            )
            
            status_info["search_test"] = "success"
            status_info["search_results_count"] = len(search_results[0]) if search_results and search_results[0] else 0
            
        except Exception as e:
            status_info["search_test"] = f"failed: {str(e)}"
        
        return status_info
        
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Test failed: {str(e)}")

@app.post("/milvus/sync_data/")
async def sync_data_to_milvus(request: MilvusSyncRequest):
    """
    Synchronize data from database to Milvus collections, similar to Faiss startup sync.
    
    This endpoint:
    1. Fetches all candidate and job data from the database
    2. Processes each field separately according to the Milvus schema
    3. Generates embeddings for the combined text (like Faiss)
    4. Stores structured data + embeddings in Milvus collections
    
    Args:
        request: MilvusSyncRequest containing sync configuration
    
    Returns:
        dict: Sync results with counts and statistics
    """
    try:
        # Establish connection
        if not get_milvus_connection():
            raise HTTPException(status_code=500, detail="Failed to connect to Milvus server")
        
        # Switch to the specified database
        db.using_database(request.database_name)
        
        sync_results = {
            "status": "success",
            "database_name": request.database_name,
            "candidates_synced": 0,
            "jobs_synced": 0,
            "candidates_errors": 0,
            "jobs_errors": 0,
            "total_time": 0,
            "details": []
        }
        
        start_time = time.time()
        
        # Sync Candidates
        if request.sync_candidates:
            try:
                candidate_result = await sync_candidates_to_milvus(
                    request.database_name,
                    request.candidate_collection_name,
                    request.batch_size,
                    request.force_recreate
                )
                sync_results["candidates_synced"] = candidate_result["total_synced"]
                sync_results["candidates_errors"] = candidate_result["errors"]
                sync_results["details"].append(candidate_result)
                
            except Exception as e:
                sync_results["details"].append({
                    "collection_type": "candidates",
                    "status": "error",
                    "error": str(e)
                })
                sync_results["candidates_errors"] = -1
        
        # Sync Jobs
        if request.sync_jobs:
            try:
                job_result = await sync_jobs_to_milvus(
                    request.database_name,
                    request.job_collection_name,
                    request.batch_size,
                    request.force_recreate
                )
                sync_results["jobs_synced"] = job_result["total_synced"]
                sync_results["jobs_errors"] = job_result["errors"]
                sync_results["details"].append(job_result)
                
            except Exception as e:
                sync_results["details"].append({
                    "collection_type": "jobs",
                    "status": "error",
                    "error": str(e)
                })
                sync_results["jobs_errors"] = -1
        
        sync_results["total_time"] = round(time.time() - start_time, 2)
        
        return sync_results
        
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error syncing data to Milvus: {str(e)}")

async def sync_candidates_to_milvus(database_name: str, collection_name: str, batch_size: int, force_recreate: bool):
    """
    Sync candidate data to Milvus collection with structured fields.
    """
    conn = get_db_connection()
    
    try:
        # Check if collection exists
        if utility.has_collection(collection_name):
            if force_recreate:
                collection = Collection(collection_name)
                collection.drop()
                print(f"[INFO] Dropped existing collection '{collection_name}' for recreation")
            else:
                print(f"[INFO] Collection '{collection_name}' already exists, skipping recreation")
                return {
                    "collection_type": "candidates",
                    "collection_name": collection_name,
                    "status": "skipped",
                    "message": "Collection already exists, use force_recreate=true to recreate",
                    "total_synced": 0,
                    "errors": 0
                }
        
        # Create collection with candidate schema if it doesn't exist
        if not utility.has_collection(collection_name):
            # Create candidate schema
            candidate_fields = [
                FieldSchema(name="candidate_id", dtype=DataType.INT64, is_primary=True, auto_id=False),
                FieldSchema(name="headline", dtype=DataType.VARCHAR, max_length=5000),
                FieldSchema(name="about", dtype=DataType.VARCHAR, max_length=16000),
                FieldSchema(name="sectors", dtype=DataType.VARCHAR, max_length=3000),
                FieldSchema(name="keywords", dtype=DataType.VARCHAR, max_length=16000),
                FieldSchema(name="skills", dtype=DataType.VARCHAR, max_length=16000),
                FieldSchema(name="languages", dtype=DataType.VARCHAR, max_length=1000),
                FieldSchema(name="projects", dtype=DataType.VARCHAR, max_length=16000),
                FieldSchema(name="certifications", dtype=DataType.VARCHAR, max_length=16000),
                FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=1536)
            ]
            
            
            schema = CollectionSchema(
                fields=candidate_fields,
                description=f"Candidate data synchronized from database with embeddings"
            )
            
            collection = Collection(name=collection_name, schema=schema)
            print(f"[INFO] Created collection '{collection_name}' with candidate schema")
        else:
            collection = Collection(collection_name)
        
        # Create an index for the embedding field if one does not exist yet
        try:
            if not collection.has_index():
                print(f"[INFO] Creating IVF_FLAT index for collection '{collection_name}'...")
                index_params = {
                    "metric_type": "IP",
                    "index_type": "IVF_FLAT",
                    "params": {
                        "nlist": 1024
                    }
                }
                collection.create_index(field_name="embedding", index_params=index_params)
                print("[INFO] Index created successfully")
        except Exception as e:
            print(f"[WARN] Index creation: {e}")
        
        # Fetch candidate data using the same query as Faiss (get_cleaned_candidate_text)
        query = """
        SELECT 
            cd.candidate_id,
            u.headline,
            u.about,
            GROUP_CONCAT(DISTINCT s.sector_name) AS sectors,
            GROUP_CONCAT(DISTINCT k.title) AS keywords,
            GROUP_CONCAT(DISTINCT sk.title) AS skills,
            GROUP_CONCAT(DISTINCT l.title) AS languages,
            GROUP_CONCAT(DISTINCT CONCAT_WS('||', up.title, up.description, up.technologies) SEPARATOR '%%') AS projects,
            GROUP_CONCAT(DISTINCT CONCAT_WS('||', uc.name, uc.authority) SEPARATOR '%%') AS certifications
        FROM 
            candidate_details cd
        LEFT JOIN users u ON cd.candidate_id = u.id
        LEFT JOIN user_sectors us ON cd.candidate_id = us.user_id
        LEFT JOIN sectors s ON us.sector_id = s.id
        LEFT JOIN user_keywords uk ON cd.candidate_id = uk.user_id
        LEFT JOIN keywords k ON uk.keyword_id = k.id
        LEFT JOIN user_skills usk ON cd.candidate_id = usk.user_id
        LEFT JOIN skills sk ON usk.skill_id = sk.id
        LEFT JOIN user_languages ul ON cd.candidate_id = ul.user_id
        LEFT JOIN languages l ON ul.language_id = l.id
        LEFT JOIN user_projects up ON cd.candidate_id = up.user_id
        LEFT JOIN user_certifications uc ON cd.candidate_id = uc.user_id
        GROUP BY 
            cd.candidate_id, u.headline, u.about
        """
        
        df = pd.read_sql_query(query, conn)
        
        if df.empty:
            return {
                "collection_type": "candidates",
                "collection_name": collection_name,
                "status": "completed",
                "message": "No candidate data found to sync",
                "total_synced": 0,
                "errors": 0
            }
        
        total_candidates = len(df)
        synced_count = 0
        error_count = 0
        
        print(f"[INFO] Starting sync of {total_candidates} candidates to Milvus")
        
        # Process in batches
        for start_idx in range(0, total_candidates, batch_size):
            end_idx = min(start_idx + batch_size, total_candidates)
            batch_df = df.iloc[start_idx:end_idx]
            
            batch_data = []
            batch_texts = []
            
            for _, row in batch_df.iterrows():
                try:
                    # Prepare structured data for Milvus (each field separately)
                    candidate_data = {
                        "candidate_id": int(row['candidate_id']),
                        "headline": str(row['headline'] or ''),
                        "about": str(row['about'] or ''),
                        "sectors": str(row['sectors'] or ''),
                        "keywords": str(row['keywords'] or ''),
                        "skills": str(row['skills'] or ''),
                        "languages": str(row['languages'] or ''),
                        "projects": str(row['projects'] or ''),
                        "certifications": str(row['certifications'] or '')
                    }
                    # Create combined text for embedding (same as Faiss approach)
                    text_parts = [str(row[col]) for col in df.columns if col != 'candidate_id' and pd.notnull(row[col])]
                    combined_text = " ".join(text_parts)
                    cleaned_text = clean_resume_text(combined_text)
                    
                    batch_data.append(candidate_data)
                    batch_texts.append(cleaned_text)
                    
                except Exception as e:
                    print(f"[ERROR] Error processing candidate {row['candidate_id']}: {e}")
                    error_count += 1
                    continue
            
            if batch_data:
                try:
                    # Generate embeddings for the batch
                    batch_embeddings = embeddings.embed_documents(batch_texts)
                    
                    # Add embeddings to the data
                    for i, embedding in enumerate(batch_embeddings):
                        batch_data[i]["embedding"] = embedding
                    
                    # Insert into Milvus
                    collection.insert(batch_data)
                    collection.flush()
                    
                    synced_count += len(batch_data)
                    print(f"[INFO] Synced batch {start_idx//batch_size + 1}: {len(batch_data)} candidates")
                    
                except Exception as e:
                    print(f"[ERROR] Error inserting batch to Milvus: {e}")
                    error_count += len(batch_data)
        
        # Ensure an index exists, then load the collection after all data is inserted
        try:
            if not collection.has_index():
                index_params = {
                    "metric_type": "IP",
                    "index_type": "IVF_FLAT",
                    "params": {"nlist": 1024}
                }
                collection.create_index(field_name="embedding", index_params=index_params)
                print(f"[INFO] Index created for collection '{collection_name}'")
        except Exception as e:
            print(f"[WARN] Index creation: {e}")

        try:
            # Load the collection into memory so it is available for search
            collection.load()
            print(f"[INFO] Collection '{collection_name}' loaded into memory")
        except Exception as e:
            print(f"[WARN] Collection load: {e}")
        
        return {
            "collection_type": "candidates",
            "collection_name": collection_name,
            "status": "completed",
            "total_candidates": total_candidates,
            "total_synced": synced_count,
            "errors": error_count,
            "batches_processed": (total_candidates + batch_size - 1) // batch_size,
            "message": f"Successfully synced {synced_count} candidates with {error_count} errors"
        }
        
    finally:
        conn.close()
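
# Minimal search sketch against the synced candidate collection (illustrative only;
# the recommendation endpoints defined elsewhere in this service may differ). Assumes
# the "default" connection is established, db.using_database() has selected the right
# database, the collection (placeholder name below) has been loaded, and the same IP
# metric as the index above.
#
#   query_vector = embeddings.embed_query("senior python developer, fastapi, milvus")
#   hits = Collection("candidate_recommendation").search(
#       data=[query_vector],
#       anns_field="embedding",
#       param={"metric_type": "IP", "params": {"nprobe": 16}},
#       limit=10,
#       output_fields=["candidate_id", "headline", "skills"],
#   )
#   for hit in hits[0]:
#       print(hit.entity.get("candidate_id"), hit.distance)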

async def sync_jobs_to_milvus(database_name: str, collection_name: str, batch_size: int, force_recreate: bool):
    """
    Sync job data to Milvus collection with structured fields.
    """
    conn = get_db_connection()
    
    try:
        # Check if collection exists
        if utility.has_collection(collection_name):
            if force_recreate:
                collection = Collection(collection_name)
                collection.drop()
                print(f"[INFO] Dropped existing collection '{collection_name}' for recreation")
            else:
                print(f"[INFO] Collection '{collection_name}' already exists, skipping recreation")
                return {
                    "collection_type": "jobs",
                    "collection_name": collection_name,
                    "status": "skipped",
                    "message": "Collection already exists, use force_recreate=true to recreate",
                    "total_synced": 0,
                    "errors": 0
                }
        
        # Create collection with job schema if it doesn't exist
        if not utility.has_collection(collection_name):
            # Create job schema
            job_fields = [
                FieldSchema(name="job_id", dtype=DataType.INT64, is_primary=True, auto_id=False),
                FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=5000),
                FieldSchema(name="description", dtype=DataType.VARCHAR, max_length=16000),
                FieldSchema(name="skills", dtype=DataType.VARCHAR, max_length=16000),
                FieldSchema(name="keywords", dtype=DataType.VARCHAR, max_length=10000),
                FieldSchema(name="languages", dtype=DataType.VARCHAR, max_length=1000),
                FieldSchema(name="sectors", dtype=DataType.VARCHAR, max_length=3000),
                FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=1536)
            ]
            
            schema = CollectionSchema(
                fields=job_fields,
                description=f"Job data synchronized from database with embeddings"
            )
            
            collection = Collection(name=collection_name, schema=schema)
            print(f"[INFO] Created collection '{collection_name}' with job schema")
        else:
            collection = Collection(collection_name)
        
        # Fetch job data using the same query structure as create_job_vectorstore
        query = """
        SELECT 
            a.id AS job_id,
            a.title AS title,
            a.text_description AS description,
            GROUP_CONCAT(DISTINCT s.title) AS skills,
            GROUP_CONCAT(DISTINCT k.title) AS keywords,
            GROUP_CONCAT(DISTINCT l.title) AS languages,
            GROUP_CONCAT(DISTINCT sec.sector_name) AS sectors
        FROM 
            assignments AS a
        LEFT JOIN assignment_skills AS ask ON a.id = ask.assignment_id
        LEFT JOIN skills AS s ON ask.skill_id = s.id
        LEFT JOIN assignment_keywords AS ak ON a.id = ak.assignment_id
        LEFT JOIN keywords AS k ON ak.keyword_id = k.id
        LEFT JOIN assignment_languages AS al ON a.id = al.assignment_id
        LEFT JOIN languages AS l ON al.language_id = l.id
        LEFT JOIN assignment_sectors AS asec ON a.id = asec.assignment_id
        LEFT JOIN sectors AS sec ON asec.sector_id = sec.id
        WHERE a.status = 'active'
        GROUP BY 
            a.id, a.title, a.text_description
        """
        
        df = pd.read_sql_query(query, conn)
        
        if df.empty:
            return {
                "collection_type": "jobs",
                "collection_name": collection_name,
                "status": "completed",
                "message": "No job data found to sync",
                "total_synced": 0,
                "errors": 0
            }
        
        total_jobs = len(df)
        synced_count = 0
        error_count = 0
        
        print(f"[INFO] Starting sync of {total_jobs} jobs to Milvus")
        
        # Process in batches
        for start_idx in range(0, total_jobs, batch_size):
            end_idx = min(start_idx + batch_size, total_jobs)
            batch_df = df.iloc[start_idx:end_idx]
            
            batch_data = []
            batch_texts = []
            
            for _, row in batch_df.iterrows():
                try:
                    # Prepare structured data for Milvus (each field separately)
                    job_data = {
                        "job_id": int(row['job_id']),
                        "title": str(row['title'] or ''),
                        "description": str(row['description'] or ''),
                        "skills": str(row['skills'] or ''),
                        "keywords": str(row['keywords'] or ''),
                        "languages": str(row['languages'] or ''),
                        "sectors": str(row['sectors'] or '')
                    }
                    
                    # Create combined text for embedding (same as Faiss approach)
                    title = str(row['title'] or '')
                    description = str(row['description'] or '')
                    skills = str(row['skills'] or '')
                    languages = str(row['languages'] or '')
                    keywords = str(row['keywords'] or '')
                    sectors = str(row['sectors'] or '')
                    
                    combined_text = f"{title.strip()} - {description.strip()} - {skills.strip()} - {languages.strip()} - {keywords.strip()} - {sectors.strip()}"
                    
                    batch_data.append(job_data)
                    batch_texts.append(combined_text)
                    
                except Exception as e:
                    print(f"[ERROR] Error processing job {row['job_id']}: {e}")
                    error_count += 1
                    continue
            
            if batch_data:
                try:
                    # Generate embeddings for the batch
                    batch_embeddings = embeddings.embed_documents(batch_texts)
                    
                    # Add embeddings to the data
                    for i, embedding in enumerate(batch_embeddings):
                        batch_data[i]["embedding"] = embedding
                    
                    # Insert into Milvus
                    collection.insert(batch_data)
                    collection.flush()
                    
                    synced_count += len(batch_data)
                    print(f"[INFO] Synced batch {start_idx//batch_size + 1}: {len(batch_data)} jobs")
                    
                except Exception as e:
                    print(f"[ERROR] Error inserting batch to Milvus: {e}")
                    error_count += len(batch_data)
        
        return {
            "collection_type": "jobs",
            "collection_name": collection_name,
            "status": "completed",
            "total_jobs": total_jobs,
            "total_synced": synced_count,
            "errors": error_count,
            "batches_processed": (total_jobs + batch_size - 1) // batch_size,
            "message": f"Successfully synced {synced_count} jobs with {error_count} errors"
        }
        
    finally:
        conn.close()
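
# Note: unlike sync_candidates_to_milvus above, this job sync does not create an
# index or load the collection after inserting; call the /milvus/load_collection/
# endpoint (which also creates an index when one is missing) before searching the
# job collection.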



import sqlite3

# SQLite file backing the recommendations cache; remove_candidate_entries() defined
# earlier in this module relies on this constant and on the sqlite3 import, which is
# fine because both are resolved only when that function is called.
DB_FILE = "recommendations.db"


@app.post("/create/sqlite/db")
def create_sqlite_db():
    try:
        conn = sqlite3.connect(DB_FILE)
        cursor = conn.cursor()
        cursor.execute("""
                    CREATE TABLE IF NOT EXISTS recommendations (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            job_id INTEGER NOT NULL,
            candidate_id INTEGER NOT NULL,
            llm_response TEXT NOT NULL CHECK (json_valid(llm_response)),
            UNIQUE(job_id, candidate_id)
        );
                """)
        conn.commit()
        conn.close()
        return {"status": "success", "message": "SQLite database initialized."}
    except Exception as e:
        return {"status": "error", "message": str(e)}



@app.delete("/remove/sqlite/db")
def remove_sqlite_db():
    try:
        if os.path.exists(DB_FILE):
            os.remove(DB_FILE)
            return {"status": "success", "message": "SQLite database removed."}
        else:
            return {"status": "not_found", "message": "Database file does not exist."}
    except Exception as e:
        return {"status": "error", "message": str(e)}



if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host=API_HOST, port=int(API_PORT))


