import requests
from requests.exceptions import Timeout, RequestException
from bs4 import BeautifulSoup, Comment
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from urllib.parse import urlparse, urljoin, urlsplit, parse_qs, urlencode
from urllib import robotparser
import os
import time
from dotenv import load_dotenv
from youtube_transcript_api import YouTubeTranscriptApi
from starlette.concurrency import run_in_threadpool
from datetime import datetime
from textwrap import wrap
from googleapiclient.discovery import build
from app.api.v1.libraries.task_status_manager import store_document_data

#import database
from .extract import extract_content
from .googledrive import download_files 
from .chatgptasync import get_response_gpt

load_dotenv()
# Constants
MAX_DEPTH = 25
MAX_URLS = 400
MAX_FAILURES = 10
MAX_RETRIES = 3
current_urls = 0  # Module-level counter (crawl_website tracks its own local count)
robot_parsers = {}
CRAWLER_COLLECTION = 'urls'
YOUTUBE_KEY = os.getenv('YOUTUBE_KEY', '')

def can_fetch_url(target_url, user_agent="*"):
    base_url = urlparse(target_url).scheme + "://" + urlparse(target_url).netloc

    if base_url not in robot_parsers:
        rp = robotparser.RobotFileParser()
        try:
            rp.set_url(urljoin(base_url, "robots.txt"))
            rp.read()
            robot_parsers[base_url] = rp
        except Exception:
            # If robots.txt cannot be fetched or parsed, allow crawling by default
            return True
    else:
        rp = robot_parsers[base_url]

    return rp.can_fetch(user_agent, target_url)

def is_valid(url, base_url):
    parsed_base = urlparse(base_url)
    parsed_url = urlparse(url)
    return parsed_url.scheme in ['http', 'https'] and parsed_base.netloc == parsed_url.netloc

def clean_text(text):
    return text.replace("\n", " ").replace("\r", " ").replace("\t", " ").replace("\xa0", " ")


#crawl_website(form_main_url, project_id, account_id, data_dir, db)

#Crawler for website content - an iterative, queue-based crawler that keeps going until the URL queue is exhausted or the limits are reached (a usage sketch follows the function)
async def crawl_website(url, project_id, account_id, folder_name, content_id, depth=0, visited=None, to_visit=None):
    #crawler_urls = db[CRAWLER_COLLECTION]
    #global current_urls  # Using the global variable to track the count
    current_urls = 0
    consecutive_failures = 0  # Track consecutive failures
    is_react = False

    print(" crawler -> ", url)

    if visited is None:
        visited = set()

    if depth == 0:
        depth = MAX_DEPTH

    # Resolve the scheme before seeding the queue so the first queued URL is valid
    if not url.startswith(('http://', 'https://')):
        try:
            response = requests.head('https://' + url, timeout=5)
            if response.status_code < 400:
                url = 'https://' + url
            else:
                url = 'http://' + url
        except requests.RequestException:
            url = 'http://' + url

    if to_visit is None:
        to_visit = [(url, depth)]

    #folder_name = f"./data/{account_id}/{project_id}/sourcefiles/"
    os.makedirs(folder_name, exist_ok=True)

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
        'Referer': url
    }
    stream = False

    response = safe_request(url, headers, stream)

    if response:
        soup = BeautifulSoup(response.content, 'html.parser')
        is_react = check_for_react(soup)  # Use the function to check for React


    while to_visit and current_urls < MAX_URLS:

        try:
            url, current_depth = to_visit.pop()
            #print(url, ":", current_urls)
            normalized_url = normalize_url(url)
            is_download = is_downloadable_file(url)
            stream = False
            if is_download:
                stream = True

            if normalized_url in visited or current_depth < 0:
                continue
            
            response = None

            allowed_mime_types = [
                'text/html',  # HTML files
                'application/pdf',  # PDF files
                'application/msword',  # DOC files
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',  # DOCX files
                'application/vnd.ms-powerpoint',  # PPT files
                'application/vnd.openxmlformats-officedocument.presentationml.presentation',  # PPTX files
                'text/plain'  # TXT files
            ]
            max_allowed_size = 20 * 1024 * 1024  # 20 MB

            # Check if the URL points to a downloadable file
            if is_download:
                # Pre-check the headers (type and size) before downloading the body
                pre_check_response = safe_request(url, headers, stream=False, method='HEAD')

                if pre_check_response:
                    content_type = pre_check_response.headers.get('Content-Type', '')
                    content_length = pre_check_response.headers.get('Content-Length', 0)  # Get the Content-Length header
                    try:
                        content_length = int(content_length)
                    except ValueError:
                        content_length = 0

                    #print("content_type :: ", content_type, "content_length :: ", content_length, "max_allowed_size :: ", max_allowed_size, "allowed_mime_types :: ", allowed_mime_types)

                    if any(allowed_type in content_type for allowed_type in allowed_mime_types) and content_length <= max_allowed_size:

                        response = safe_request(url, headers, stream=True)
                        if response and response.status_code == 200:
                        
                            file_name = urlsplit(url).path.split('/')[-1]
                            file_path = os.path.join(folder_name, file_name)
                            try:
                                with open(file_path, 'wb') as out_file:
                                    for chunk in response.iter_content(chunk_size=32768):
                                        if chunk:  # filter out keep-alive new chunks
                                            out_file.write(chunk)
                                #print(f"Downloaded file saved to: {file_path}")

                                # Create and insert document into MongoDB
                                title = os.path.splitext(file_name)[0].replace('-', ' ').replace('_', ' ')
                                document = create_document(normalized_url, project_id, title, "", "", file_name)
                                store_document_data(content_id, document)
                                #crawler_urls.insert_one(document)
                            except Exception as e:
                                print(f"Error while saving the file {file_name}: {e}")
                        
                        #download request failed or returned a non-200 status
                        else:
                            print(f"Failed to download file: {url}")

                    #type/size check ends here
                    else:
                        print(f"File does not meet download criteria. Type: {content_type}, Size: {content_length} bytes")
                
                #header check ends here
                else:
                    print("Failed to fetch file headers.")
                
                visited.add(normalized_url)

            else:
                #print("URL - ", url)
                html_content = None
                dynamic_title = None
                dynamic_meta_description = None
                #is_react = True
                if is_react:
                    # Fetch the page using Selenium and get the HTML content
                    html_content, dynamic_title, dynamic_meta_description = await fetch_page_with_selenium(url)
                    if not html_content:  # Check if fetching was unsuccessful
                        print(f"Failed to fetch {url} with Selenium.")
                        visited.add(normalized_url)
                        continue
                else:
                    # Fetch the page using requests
                    response = safe_request(url, headers, stream)
                    if response and response.status_code == 200:
                        html_content = response.content
                    else:
                        print(f"Failed to fetch {url} with requests. Status code: {response.status_code if response else 'N/A'}")
                        visited.add(normalized_url)
                        continue

                #filename = os.path.join(folder_name, url.split('//')[1].replace('/', '_').rstrip('_') + '.txt')
                filename = os.path.join(folder_name, url.split('//')[1].replace('/', '~slash~').rstrip('~slash~') + '.txt')

                soup = BeautifulSoup(html_content, 'html.parser')

                title, meta_description, keywords, content, youtube_embeds = extract_content(soup, url)

                if dynamic_title:
                    title = dynamic_title
                
                if dynamic_meta_description:
                    meta_description = dynamic_meta_description

                #print(title, meta_description, keywords , youtube_embeds)

                if youtube_embeds:
                    await process_youtube_embeds(youtube_embeds, project_id, account_id, folder_name)

                document = create_document(normalized_url, project_id, title, meta_description, keywords, filename)
                store_document_data(content_id, document)
                #crawler_urls.insert_one(document)

                #Content formatting via the GPT API is currently disabled; the extracted content is used as-is
                formatted_content = content  # await format_content_with_gpt(content)

                with open(filename, 'w', encoding='utf-8') as file:
                    file.write(formatted_content)
            
                extracted_urls = extract_links(soup, url)
                #print(extracted_urls)
                for extracted_url in extracted_urls:
                    normalized_joined_url = normalize_url(extracted_url)
                    if normalized_joined_url not in visited:
                        to_visit.append((normalized_joined_url, current_depth-1))
                
            #else ends here 
                        
            consecutive_failures = 0
            current_urls += 1
            visited.add(normalized_url)
        except Exception as e:
            print(f"Error in LOOP : {e}")
            continue  # Optionally handle or log the error and continue with the next URL

    # Ensure the loop ends when all URLs are visited or max URLs limit is reached
    if current_urls >= MAX_URLS:
        print("Reached maximum URL limit.")
    elif not to_visit:
        print("No more URLs to visit.")

def normalize_url(url):
    parsed = urlparse(url)
    # Filter query parameters to include only those that affect content
    relevant_params = {key: value for key, value in parse_qs(parsed.query).items() if key in ['option', 'view', 'id', 'Itemid', 'lang']}
    # Re-encode the filtered query parameters
    normalized_query = urlencode(relevant_params, doseq=True)
    # Construct the normalized URL with relevant query parameters
    return f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{normalized_query}".rstrip('?')


def normalize_url_old(url):
    parsed = urlparse(url)
    return parsed.scheme + "://" + parsed.netloc + parsed.path.rstrip('/')

def extract_links(soup, base_url):
    extracted_links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag.get('href')
        # Normalize and resolve relative URLs
        joined_url = urljoin(base_url, href)
        if is_valid(joined_url, base_url):
            extracted_links.append(joined_url)
    return extracted_links

#Sends an HTTP request (GET or HEAD) to a given URL with retries, timeouts, and a size cap for streamed downloads
def safe_request(url, headers, stream=False, method='GET', max_retries=MAX_RETRIES):
    retries = 0
    while retries < max_retries:
        try:
            #response = requests.get(url, headers=headers, stream=stream, timeout=(10, 30))
            if method.upper() == 'HEAD':
                response = requests.head(url, headers=headers, timeout=(10, 30))
            else:  # Default to GET
                response = requests.get(url, headers=headers, stream=stream, timeout=(10, 30))

            if response.status_code == 200:
                if stream and method.upper() == 'GET':
                    # Handle streaming for large files
                    file_size = int(response.headers.get('content-length', 0))
                    max_size = 20 * 1024 * 1024  # 20 MB
                    if file_size <= max_size:
                        return response
                    else:
                        print(f"File too large to download: {url}")
                        return None
                    
                return response
        except Timeout:
            print(f"Timeout occurred for {url}. Retrying ({retries+1}/{max_retries})...")
        except RequestException as e:
            print(f"Request failed: {e}")
            break
        retries += 1
    return None
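
# Usage sketch (illustrative): a HEAD pre-check before committing to a streamed GET, mirroring
# how crawl_website handles downloadable files; "https://example.com/report.pdf" is a placeholder
# and `headers` refers to a dict like the one built in crawl_website.
#
#   head = safe_request("https://example.com/report.pdf", headers, method='HEAD')
#   if head and 'application/pdf' in head.headers.get('Content-Type', ''):
#       resp = safe_request("https://example.com/report.pdf", headers, stream=True)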


async def fetch_page_with_selenium(url):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState === 'complete'")
        )
        time.sleep(5)
        # Fetch the rendered title and meta description via JavaScript execution
        dynamic_title = driver.execute_script("return document.title;")

        dynamic_meta_descriptions = driver.execute_script("""
        var metas = document.querySelectorAll('meta[name="description"]');
        var descriptions = [];
        for (var i = 0; i < metas.length; i++) {
            descriptions.push(metas[i].getAttribute('content'));
        }
        return descriptions;
        """)

        if dynamic_meta_descriptions:
            dynamic_meta_description = dynamic_meta_descriptions[-1]  # This selects the last item
        else:
            dynamic_meta_description = None
        
        page_source = driver.page_source
        #print(dynamic_meta_description, " :::: ", dynamic_title)
        return page_source, dynamic_title, dynamic_meta_description
    except Exception as e:
        print(f"Error fetching page with Selenium: {e}")
        return None, None, None
    finally:
        driver.quit()


#Check whether the site appears to be a JavaScript app (e.g. React) via its noscript fallback message
def check_for_react(soup):
    noscript_tags = soup.find_all('noscript')
    for noscript_tag in noscript_tags:
        if 'You need to enable JavaScript to run this app.' in noscript_tag.text:
            return True
    return False

#Detects common JS frameworks (React, Angular, Vue) from markers in the parsed HTML
def detect_js_framework(soup):
    # React detection using unique React attributes
    if soup.find_all(attrs={"data-reactroot": True}) or \
       soup.find_all(attrs={"data-reactid": True}) or \
       any('__reactInternal' in str(tag) for tag in soup.find_all()):
        return True
    
    # Angular detection using ng-version and common Angular attributes
    if soup.find_all(attrs={"ng-version": True}) or \
       soup.find_all(lambda tag: tag.name and any(attr for attr in tag.attrs if attr.startswith('ng-'))):
        return True
    
    # Vue detection using Vue comments or common id used for Vue mounting point
    if soup.find_all(string=lambda text: isinstance(text, Comment) and 'vue-template' in text) or \
       soup.find(id='app'):
        return True
    
    return False

async def process_youtube_embeds(embed_urls, project_id, account_id, folder_name):
    # embed_urls is the list of embedded YouTube URLs collected during page extraction
    for embed_url in embed_urls:
        await crawl_youtube_transcript(embed_url, project_id, account_id, folder_name)


# Function to extract YouTube transcript
async def crawl_youtube_transcript(url, project_id, account_id, folder_name):

    #parts = url.split("v=")
    #if len(parts) < 2:
    #    return f"Couldn't find video_id {url}"

    #video_id = parts[1].split("&")[0]

    video_id = None
    if "embed/" in url:
        video_id = url.split("embed/")[1].split("?")[0] or None
    elif "watch?v=" in url:
        video_id = url.split("watch?v=")[1].split("&")[0] or None
    
    if not video_id:
        return f"Couldn't find video_id in {url}"

    youtube = build('youtube', 'v3', developerKey=YOUTUBE_KEY)
    title = description = cleaned_text = None
    filename = f"{video_id}_transcript.txt"

    try:
        request = youtube.videos().list(
            part="snippet",
            id=video_id
        )
        response = request.execute()
        if response['items']:
            video_item = response['items'][0]['snippet']
            title = video_item['title']
            description = video_item['description']
    except Exception as e:
        print("Error fetching YouTube video metadata:", e)

    try:
        srt = YouTubeTranscriptApi.get_transcript(video_id)

        os.makedirs(folder_name, exist_ok=True)
        text_entries = [entry['text'] for entry in srt]

        formatted_text = "\n".join(text_entries)
        try:
            cleaned_text = await get_clean_document_davinci(formatted_text)
        except Exception:
            cleaned_text = formatted_text

    except Exception as e:
        print("Error fetching YouTube transcript:", e)

    content = "YouTube Video :\n"
    if title:
        content += f"Title: {title}\n"
    if description:
        content += f"Description: {description}\n"
    if cleaned_text:
        content += f"\nTranscript:\n{cleaned_text}"


    file_path = os.path.join(folder_name, filename)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)

    return filename
    
# Function to extract Google Drive files
async def crawl_googledrive(url, folder_name, authCode):
    await download_files(url, folder_name, authCode)
    return True

# Function to extract Dropbox files (not yet implemented)
def crawl_dropbox(url, folder_name):
    return True


async def get_clean_document_davinci(text):
    """
    This function can be generalized to accommodate different chatbot implementations.
    For now, it just uses OpenAI's API.
    """
    prompt_text = f" Please format this content properly -> Content: {text}\n"

    try:

        system_message = {"role": "system", "content": "This is youtube video transcript, please rewrite the content properly with keywords and return text summary"}
        user_message = {"role": "user", "content": prompt_text}

        #youtube_content = get_response_gpt(system_message, user_message, "", 500 )
        response = await run_in_threadpool(get_response_gpt, system_message, user_message, 'gpt-3.5-turbo', 1500)
        response_text = response['bot_response']
        #print("parsed youtube content :: ", response_text)
        
        return response_text
    
    except Exception as e:
        print(f"Error in getting response from Davinci: {e}")
        return text
    
def create_document(url, project_id, title, meta_description, keywords, filename):
    return {
        "url": url,
        "project_id": project_id,
        "title": title,
        "meta_description": meta_description,
        "keywords": keywords,
        "filename": filename,
        "updated_date": datetime.utcnow()
    }

def is_downloadable_file(url):
    # List of file extensions that are considered as downloadable files
    downloadable_extensions = ['.pdf', '.doc', '.xml', '.json', '.txt', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.mp4', '.mp3', '.avi', '.wav']
    # Return True if the URL ends with any of the extensions in the list
    return any(url.lower().endswith(ext) for ext in downloadable_extensions)
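
# Examples (illustrative):
#   is_downloadable_file("https://example.com/files/report.pdf")  ->  True
#   is_downloadable_file("https://example.com/about-us")          ->  False
# Note: URLs with trailing query strings (e.g. "report.pdf?dl=1") are not matched by the
# endswith check above.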

async def format_content_with_gpt(original_content, max_token_length=2000):
    """
    Formats content using GPT, handling content longer than max_token_length by splitting and merging.
    """
    # Function to split content into chunks
    def split_content(content, max_words=2000):
        """
        Splits content into chunks, each with up to max_words words.
        """
        words = content.split()
        chunks = [' '.join(words[i:i+max_words]) for i in range(0, len(words), max_words)]
        return chunks

    # Split the content if it exceeds the limit (max_token_length is applied as a word count by split_content)
    content_chunks = split_content(original_content, max_token_length)
    formatted_content_chunks = []

    for chunk in content_chunks:
        # Prepare the messages for GPT
        system_message = {"role": "system", "content": "Format the content for rewrite in detail without missing any crucial information such as products, pricing, address, emails or contacts. Also remove unwanted texts"}
        user_message = {"role": "user", "content": chunk}
        
        # get_response_gpt is a synchronous helper, so run it in a threadpool to avoid blocking the event loop
        response = await run_in_threadpool(get_response_gpt, system_message, user_message, 'gpt-3.5-turbo', 1400)
        
        # Extracting formatted content from response; adjust according to your response structure
        formatted_chunk = response["bot_response"]
        formatted_content_chunks.append(formatted_chunk)

    # Merge the formatted chunks
    formatted_content = " ".join(formatted_content_chunks)
    return formatted_content
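
# Chunking example (illustrative): split_content partitions on whitespace, so with max_words=3
# a five-word string becomes two chunks:
#   split_content("one two three four five", max_words=3)  ->  ["one two three", "four five"]
# split_content is defined inside format_content_with_gpt, so this shows its behaviour rather
# than a directly importable helper.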


