import os
import re
from dotenv import load_dotenv
from langchain_community.embeddings import SentenceTransformerEmbeddings, HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
from nltk.tokenize import word_tokenize, sent_tokenize
from fastapi import Request, Depends
from pymongo import DESCENDING
from pymongo.collection import Collection
import numpy as np
import torch

# chromadb.py

# Load environment variables before reading any configuration from os.environ
if not load_dotenv():
    print("Could not load .env file or it is empty. Please check if it exists and is readable.")
    exit(1)

embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME")
SOME_RELEVANCE_THRESHOLD = 0.6
URL_DETAILS = 'urls'


class ChromaDB:
    def __init__(self, location: str):
        # Initialize the database connection
        self.location = location
        self.connection = self.connect_to_db()

    def connect_to_db(self):
        # Connect to ChromaDB at the specified location.
        # Placeholder: replace with the actual connection logic.
        connection = None  # Placeholder
        return connection

    def query(self, user_input: str):
        # Query ChromaDB (RAG retrieval) with the user_input and
        # return relevant business content.
        return "Sample business content based on user input."  # Placeholder

    def close(self):
        # Close the database connection
        pass  # Placeholder


# Per-tenant DB location: one ChromaDB instance per customer account and project
def get_chromadb_location(account_id: str, project_id: str) -> str:
    return f"./data/{account_id}/{project_id}/chromadb/"

# Initialize ChromaDB with embeddings
def initialize_chromadb(account_id: str, project_id: str):
    chromadb_location = get_chromadb_location(account_id, project_id)
    #embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
    embeddings = SentenceTransformerEmbeddings(model_name=embeddings_model_name)

    return Chroma(persist_directory=chromadb_location, embedding_function=embeddings)
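
# Usage sketch (hypothetical account/project IDs): each tenant gets its own
# store persisted under ./data/<account_id>/<project_id>/chromadb/.
#   db = initialize_chromadb("acme", "website-bot")
#   db.add_texts(["Some business content"], metadatas=[{"source": "about.txt"}])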

# Initialize ChromaDB without embeddings
def normal_initialize_chromadb(account_id: str, project_id: str) -> ChromaDB:
    """
    Initialize and return a ChromaDB instance for the given account and project.
    """
    chromadb_location = get_chromadb_location(account_id, project_id)
    db_instance = ChromaDB(chromadb_location)
    return db_instance


def query_business_content_from_vector_db(user_input: str, db_instance: ChromaDB):
    """
    Use the provided user_input to query the vector database (ChromaDB with RAG) 
    and retrieve relevant business content.
    """
    content = db_instance.query(user_input)
    return content

def preprocess_input(text):
    """
    Preprocess the input text by lowercasing, removing special characters, and handling synonyms.
    """
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    # Add any domain-specific synonym handling if necessary
    return text

def split_document_into_sections(text, max_section_length=512):
    """
    Split a long document into smaller sections based on sentence boundaries.
    Each section will have a maximum length of `max_section_length`.
    """
    sentences = sent_tokenize(text)
    sections = []
    current_section = ""

    for sentence in sentences:
        if current_section and len(current_section) + len(sentence) > max_section_length:
            sections.append(current_section.strip())
            current_section = sentence
        else:
            current_section += " " + sentence

    if current_section:
        sections.append(current_section.strip())

    return sections
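
# Sketch of how the sections above could be indexed (assumption: the helper name,
# the "source" metadata key, and the use of Chroma.add_texts are illustrative,
# not part of the original pipeline).
def index_document_sections(db, text, source_name):
    """
    Split a document into sections and add each one to the Chroma store so it
    is embedded and retrievable on its own. The source file name is kept in
    metadata so retrieval can recover it later.
    """
    sections = split_document_into_sections(text)
    metadatas = [{"source": source_name} for _ in sections]
    db.add_texts(texts=sections, metadatas=metadatas)
    return len(sections)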

def extract_doc_info(doc):
    """
    Extracts name, file_name, and content from a document.
    """
    name = os.path.basename(doc.file_name)
    file_name = doc.file_name
    content = doc.content
    return name, file_name, content

class ChromaDocument:
    def __init__(self, name, file_name, content):
        self.name = name
        self.file_name = file_name
        self.content = content

def retrieve_documents_from_chromadb(db, user_input_text):
    """
    Retrieve relevant documents from ChromaDB based on the user's input.
    """
    # Preprocess user input for better matching
    processed_input = preprocess_input(user_input_text)

    # Use ChromaDB search with the processed input
    retrieved_docs = db.search(processed_input, search_type='similarity', k=5)
    #print(retrieved_docs)
    #sorted_docs = sorted(retrieved_docs, key=lambda doc: doc.similarity_score, reverse=True)
    #most_relevant_doc = sorted_docs[0] if sorted_docs else None
    
    result = []
    for doc in retrieved_docs:
        # Extract content, file name, and additional processing as needed
        content = doc.page_content
        file_name = doc.metadata.get("source", "")
        name = os.path.basename(file_name)
        
        # Consider additional relevance checks or content processing here

        result.append(ChromaDocument(name, file_name, content))
    
    # Further refine or sort the results if necessary
    return result
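
# Alternative retrieval sketch (assumption: this helper is illustrative; it uses
# LangChain's similarity_search_with_relevance_scores, which returns (doc, score)
# pairs with scores normalized to [0, 1], whereas the original flow uses db.search above).
def retrieve_documents_with_threshold(db, user_input_text, k=5, threshold=SOME_RELEVANCE_THRESHOLD):
    """
    Retrieve documents and drop matches whose relevance score falls below the
    configured threshold, returning the same ChromaDocument objects as above.
    """
    processed_input = preprocess_input(user_input_text)
    scored_docs = db.similarity_search_with_relevance_scores(processed_input, k=k)

    result = []
    for doc, score in scored_docs:
        if score < threshold:
            continue
        file_name = doc.metadata.get("source", "")
        result.append(ChromaDocument(os.path.basename(file_name), file_name, doc.page_content))
    return result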


def process_documents_from_chromadb(retrieved_docs):
    total_content = []
    links = set()
    youtube = set()
    docs = []
    root_url = ''
    retrieved_docs = sorted(retrieved_docs, key=lambda doc: getattr(doc, "similarity_score", 0), reverse=True)

    for doc in retrieved_docs:
        file_name = doc.file_name
        content = " ".join(doc.content.split())        
        total_content.append(content)
        
        #print(file_name)
        # Categorizing filenames
        if "_transcript.txt" in file_name:
            youtube_id = file_name.split("_transcript.txt")[0]
            youtubecode = youtube_id.rsplit("/", 1)[-1]
            youtube.add(youtubecode)

        elif ".txt" in file_name:
            # Read the content of the file
            with open(file_name, 'r') as f:
                content_lines = f.readlines()

            # Check for a line starting with "Title:"
            title_from_content = None
            for line in content_lines:
                if line.startswith("Title:"):
                    title_from_content = line.replace("Title:", "").strip()
                    break

            # Handle double "www."
            link = file_name
            if "www.www." in link:
                link = link.replace("www.www.", "www.")
            
            # Check if it's a website link and extract title
            link = link.split("/")[-1].replace("_", "/").replace(".txt", "")
            if title_from_content:
                title = title_from_content
            else:
                title = link.split("/")[-1].replace(".html", "").replace("-", " ").title()
            link_with_title = f'{link}:::{title}'
            links.add(link_with_title)
            
        else:
            doc_link = f"{root_url}/download/{file_name.split('/')[-1]}"
            docs.append(doc_link)

    total_content = " ".join(total_content).replace("\n", " ").strip()
    links = list(links)    
    youtube = list(youtube)    
    #print(links, youtube, docs)
    return total_content, links, youtube, docs

def process_retrieval_documents_from_chromadb(retrieved_docs, dbmongo):
    #url_collection = dbmongo.collection[URL_DETAILS]
    url_collection: Collection = dbmongo[URL_DETAILS]

    total_content = []
    links = set()
    youtube = set()
    docs_set = set()
    root_url = ''
    for doc in retrieved_docs:
        # Extracting page_content and source from the document
        content = " ".join(doc.page_content.split())
        source = doc.metadata.get('source', '')
        #print(source)
        #print(content)

        total_content.append(content)

        # Extract the file name from the source
        file_name = source.split('/')[-1]

        # Categorizing filenames
        if "_transcript.txt" in file_name:
            youtube_id = file_name.split("_transcript.txt")[0]
            youtubecode = youtube_id.rsplit("/", 1)[-1]
            youtube.add(youtubecode)

        elif ".txt" in file_name:
            try:
                #print(file_name)
                parts = file_name.split("?", 1)
                #OLD APPROACH
                #base_url_modified = parts[0].replace("_", "/").replace(".txt", "")
                #partial_link = base_url_modified if len(parts) == 1 else base_url_modified + "?" + parts[1]

                #NEW APPROACH
                base_url_modified = parts[0].replace("~slash~", "/").replace(".txt", "")
                partial_link = base_url_modified if len(parts) == 1 else base_url_modified + "?" + parts[1]

                #partial_link = file_name.replace("_", "/").replace(".txt", "")
                http_url = 'http://' + partial_link
                https_url = 'https://' + partial_link
                #print(https_url)
                db_doc_cursor = url_collection.find(
                    {
                        "$or": [
                            {"filename": {"$regex": re.escape(file_name)}}
                        ]
                    },
                    sort=[("updated_date", DESCENDING)]  # Sort by updated_date in descending order
                ).limit(1)  # Limit to only the first document
                
                db_doc = next(db_doc_cursor, None)  # Get the first document or None if empty
                #print(db_doc)
                if db_doc:
                    title = db_doc.get("title", "No Title")
                    meta_description = db_doc.get("meta_description", "")
                    updated_link = db_doc.get("url", "")
                    link_with_title_description = f'{updated_link}:::{title}:::{meta_description}'
                    links.add(link_with_title_description)
                else:
                    links.add(f'{partial_link}:::No Title:::')
            except Exception as e:
                print("ERROR", e)
            
        else:
            #doc_link = f"{root_url}/download/{file_name}"
            doc_link = source
            #print(doc_link)
            docs_set.add(doc_link)

    total_content = " ".join(total_content).replace("\n", " ").strip()
    links = list(links)
    youtube = list(youtube)
    docs = list(docs_set)

    return total_content, links, youtube, docs

def truncate_content(content, max_tokens):
    tokens = word_tokenize(content)
    #print(len(tokens))
    truncated_content = ' '.join(tokens[:max_tokens])
    return truncated_content, len(tokens)

def generate_query_embedding(text: str, model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> np.ndarray:
    # Load tokenizer and model from Hugging Face Transformers
    # (note: loading on every call is expensive; cache these in production)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Encode text into input tensors
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Generate embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        # Use the mean of the last hidden state (mean pooling over tokens)
        embeddings = outputs.last_hidden_state.mean(dim=1)

    # Convert to a numpy array of shape (1, hidden_size) and return
    return embeddings.cpu().numpy()
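
# Sketch (assumption: helper name is illustrative) showing how the embeddings
# above can be compared with scipy's cosine distance; similarity = 1 - distance.
def embedding_similarity(query_text: str, candidate_text: str,
                         model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> float:
    """
    Score a candidate text against a query by cosine similarity of their
    mean-pooled embeddings. A result above SOME_RELEVANCE_THRESHOLD could be
    treated as relevant.
    """
    query_emb = generate_query_embedding(query_text, model_name)[0]
    candidate_emb = generate_query_embedding(candidate_text, model_name)[0]
    return 1.0 - cosine(query_emb, candidate_emb)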

def rank_documents(documents: list) -> list:
    """
    Ranks documents based on a hypothetical 'relevance_score' attribute.
    The documents parameter is expected to be a list of dictionaries, where
    each dictionary represents a document and contains at least a 'relevance_score' key.
    
    :param documents: List of document dictionaries.
    :return: List of ranked document dictionaries.
    """
    # Sort the documents based on relevance_score in descending order
    ranked_documents = sorted(documents, key=lambda x: x.get('relevance_score', 0), reverse=True)
    return ranked_documents
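
# Example usage sketch (hypothetical scores):
#   docs = [{"id": "a", "relevance_score": 0.42}, {"id": "b", "relevance_score": 0.87}]
#   rank_documents(docs)  # -> documents ordered b (0.87) first, then a (0.42)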
