import re
from typing import Any, Iterable, Union, List
from bs4 import BeautifulSoup, Tag
from collections.abc import Callable
import requests
from urllib.parse import urlparse


def clean_text(text):
    """Collapse whitespace: newlines, carriage returns, tabs and
    non-breaking spaces become single spaces, then runs of spaces are
    squeezed down to one."""
    for ws in ("\n", "\r", "\t", "\xa0"):
        text = text.replace(ws, " ")
    return re.sub(' +', ' ', text)

def extract_elements(soup: BeautifulSoup, tags: Union[str, List[str]], attr: str = None, *, default: Union[str, bool] = "N/A") -> Union[str, List[str]]:
    """Collect cleaned text (or an HTML attribute) from every matching tag.

    Args:
        soup: Parsed document to search.
        tags: A single tag name or a list of tag names.
        attr: If given, return this HTML attribute's value for each matching
            element (elements lacking the attribute are skipped) instead of
            the element's text.  Previously this parameter was accepted but
            silently ignored; ``attr=None`` keeps the old behavior exactly.
        default: Value returned when nothing matches.

    Returns:
        A list of strings, or ``default`` when no values were found.
    """
    if isinstance(tags, str):
        tags = [tags]  # Convert single tag to list

    elements = []
    for tag in tags:
        elements.extend(soup.find_all(tag))

    if attr is not None:
        # Attribute mode: pull the requested HTML attribute, skipping
        # elements that do not carry it.
        values = [elem.get(attr) for elem in elements if elem.get(attr)]
    else:
        values = [clean_text(elem.get_text(strip=True)) for elem in elements]
    return values if values else default

def extract_title(soup: BeautifulSoup, default: str = "N/A") -> str:
    """Return the cleaned text of the <title> tag, or ``default`` when
    the document has no title tag."""
    title_tag = soup.find('title')
    if not title_tag:
        return default
    return clean_text(title_tag.get_text(strip=True))

def extract_meta_description(soup: BeautifulSoup) -> str:
    """Return the content of <meta name="description">, or "N/A" when the
    tag is missing or has an empty content attribute."""
    tag = soup.find('meta', attrs={'name': 'description'})
    content = tag.get('content') if tag else None
    return content if content else "N/A"

def extract_keywords(soup: BeautifulSoup) -> str:
    """Return the content of <meta name="keywords">, or "N/A" when the
    tag is missing or has an empty content attribute."""
    tag = soup.find('meta', attrs={'name': 'keywords'})
    content = tag.get('content') if tag else None
    return content if content else "N/A"

def extract_attribute(soup: BeautifulSoup, tag: str, attribute: str, default: Union[str, bool] = "N/A") -> Union[str, bool]:
    """Return the HTML attribute value of the first element matching ``tag``.

    Bug fix: the previous implementation used ``getattr(element, attribute,
    default)``, which performs Python attribute lookup — bs4's Tag resolves
    unknown Python attributes as a child-*tag* search, so this never read
    the HTML attribute.  ``Tag.get`` is the correct API.

    Args:
        soup: Parsed document.
        tag: Tag name to search for.
        attribute: HTML attribute name to read (e.g. "href", "src").
        default: Returned when the tag or the attribute is absent.
    """
    element = soup.find(tag)
    return element.get(attribute, default) if element else default

def remove_unwanted_tags(soup: BeautifulSoup, tags: Iterable[str]) -> None:
    """Remove every occurrence of the given tag names from the tree,
    mutating ``soup`` in place."""
    for tag_name in tags:
        for element in soup.find_all(tag_name):
            element.extract()

def extract_emails(soup: BeautifulSoup) -> List[str]:
    """Find e-mail addresses in the page's visible text.

    Bug fix: the TLD character class was ``[A-Z|a-z]``, which wrongly
    accepted a literal ``|`` character inside the top-level domain;
    corrected to ``[A-Za-z]``.
    """
    EMAIL_REGEX = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b'
    raw_text = soup.get_text()
    return re.findall(EMAIL_REGEX, raw_text)

def extract_phones(soup: BeautifulSoup) -> List[str]:
    """Pull candidate phone numbers from the page's visible text.

    A deliberately loose pattern matches many international formats;
    candidates are then kept only when they contain at least 10
    characters after separators (spaces, parens, dashes, dots, plus)
    are stripped.
    """
    PHONE_REGEX = r'\+?\d{0,3}[-\.\s\(\)]?\d{1,4}[-\.\s]?\d{1,4}[-\.\s]?\d{1,4}[-\.\s]?\d{1,4}'
    candidates = re.findall(PHONE_REGEX, soup.get_text())

    def _stripped_length(candidate):
        # Length of the candidate once formatting characters are removed.
        return len(re.sub(r'[\s\(\)\-\.\+]', '', candidate))

    return [c for c in candidates if _stripped_length(c) >= 10]

def extract_address(soup: BeautifulSoup) -> List[str]:
    """Heuristically extract street-address-like strings from page text.

    Matches a house number followed by a street-type keyword; each match
    is trimmed to its first line, since the greedy tail of the pattern
    tends to sweep up unrelated following text.
    """
    ADDRESS_REGEX = r'\d{1,4}[\s\w,.]*(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Court|Ct|Lane|Ln|extension|Area|TN|VIC)\b[\s\w,.-]*'
    matches = re.findall(ADDRESS_REGEX, soup.get_text())

    first_lines = (m.split('\n')[0].strip() for m in matches)
    return [line for line in first_lines if line]


# Function to extract images with alt tags
def extract_images(soup):
    """Return (src, alt) pairs for every image whose src is an absolute
    http(s) URL; relative and data: URLs are skipped."""
    pairs = []
    for img in soup.find_all('img'):
        src = img.get('src', '')
        if src.startswith(('http', 'https')):
            pairs.append((src, img.get('alt', '')))
    return pairs

# Function to extract YouTube embeds
def extract_youtube_embeds(soup):
    """Return the src URL of every embedded YouTube iframe on the page."""
    embed_pattern = re.compile(r'youtube\.com/embed/')
    return [frame['src'] for frame in soup.find_all('iframe', src=embed_pattern)]

# Function to extract links
def extract_links(soup):
    """Return the href value of every anchor tag that carries one."""
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]

def is_contact_page(url: str) -> bool:
    """Decide whether a URL is worth mining for contact details.

    True for the site root (and a few common home paths), or for any
    path containing "contact", "about" or "signup".
    """
    path = urlparse(url).path
    if path in ("", "/", "/index", "/home"):
        return True  # Root domain
    return any(word in path for word in ("contact", "about", "signup"))

def format_extracted_data(data):
    """
    Render extracted data as "Key: value" lines, one entry per line.

    Entries whose value is falsy or the literal "N/A" are dropped.
    List values have empty strings filtered out (the whole entry is
    dropped if nothing remains) and are then joined — with ". " for
    the "Content" key, ", " for all others.
    """
    rendered = []
    for key, value in data.items():
        if not value or value == "N/A":
            continue  # Skip empty values or 'N/A'

        if isinstance(value, list):
            kept = [item for item in value if item]
            if not kept:
                continue
            joiner = ". " if key == "Content" else ", "
            value = joiner.join(kept)

        rendered.append(f"{key}: {value}")

    return "\n".join(rendered)

def extract_content(soup, url):
    """Extract the page's main text plus metadata.

    Mutates ``soup``: script/style/select elements are decomposed before
    the body text is read.  Contact details (emails/phones) are appended
    only when ``is_contact_page(url)`` says the page is relevant.

    Returns:
        Tuple of (title, meta_description, keywords, text, youtube_embeds).
    """
    # ``.string`` can be None for an empty or nested <title>; fall back
    # to '' so the header f-string never renders the literal "None".
    title = (soup.title.string or '') if soup.title else ''
    meta_description = extract_meta_description(soup)
    keywords = extract_keywords(soup)

    # Collect YouTube embed URLs before any tags are removed.
    youtube_embeds = extract_youtube_embeds(soup)
    youtube_embeds_text = ''
    if youtube_embeds:
        youtube_embeds_text = ', '.join(youtube_embeds)

    # Remove non-content elements
    for script_or_style in soup(["script", "style",  "select"]):  # "header", "footer" can also be included if needed
        script_or_style.decompose()

    # Extract text and re-join as newline-separated, non-empty chunks.
    text = soup.get_text(separator=' ', strip=True)
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)

    # Prepend meta information to the main text
    header_content = f"{title}\n{meta_description}\n{keywords}\n"
    text = f"{header_content}\n{text}\n{youtube_embeds_text}"

    # Append contact information if applicable
    if is_contact_page(url):
        extracted_emails = extract_emails(soup)
        extracted_phones = extract_phones(soup)
        if extracted_emails:
            emails_text = ', '.join(extracted_emails)  # Joining array elements into a string
            text += f"\nYou can contact or reach out to our teams (sales/support) via email: {emails_text}"
        if extracted_phones:
            phones_text = ', '.join(extracted_phones)  # Joining array elements into a string
            text += f"\nYou can contact or reach out to our teams (sales/support) via phone: {phones_text}"

    return title, meta_description, keywords, text, youtube_embeds

def extract_content_old(soup, url):
    """Legacy extractor kept for reference; superseded by ``extract_content``.

    Builds a dict of page sections (title, meta, headings, body content,
    YouTube embeds), optionally augments it with contact details for
    contact-like pages, and renders it via ``format_extracted_data``.

    Returns:
        Tuple of (title, meta_description, keywords, formatted_output).
    """
    # NOTE(review): the default passed here is the literal string "title",
    # not "N/A" as elsewhere — presumably intentional, but worth confirming.
    title = extract_title(soup, "title")
    meta_description = extract_meta_description(soup)
    keywords = extract_keywords(soup)  # Implement this function to extract keywords

    data = {
        "Title": title,
        "Meta Description": meta_description,
        "H1": extract_elements(soup, "h1"),
        "H2": extract_elements(soup, "h2"),
        "H3": extract_elements(soup, "h3"),
        "H4": extract_elements(soup, "h4"),
        "H5": extract_elements(soup, "h5"),
        #"Paragraphs": extract_elements(soup, "p"),
        "Content": extract_elements(soup,["p", "span", "div", "td"]), #["p", "div", "span"]),
        #"images" : extract_images(soup),
        "youtube_embeds" : extract_youtube_embeds(soup),
        #"links" : extract_links(soup),
    }

    # Contact details are only mined on pages likely to hold them.
    if is_contact_page(url):
        extracted_emails = extract_emails(soup)
        extracted_phones = extract_phones(soup)
        if extracted_emails:
            data["Contact Details (Emails)"] = 'You can contact or reach out to our teams (sales / support) on email as follows: '
            data["Emails"] = extracted_emails
        if extracted_phones:
            data["Contact Details (Phone)"] = 'You can contact or reach out to our teams (sales / support)  on phone as follows for multiple regions: '
            data["Phone"] = extracted_phones

        #data["locations"] = extract_address(soup)
        #print(data["Contact Emails"], ":" , data["Contact Phone Numbers"])

    # NOTE(review): this runs AFTER every value above was already extracted,
    # so it mutates ``soup`` but cannot affect this function's own output.
    remove_unwanted_tags(soup, ("script", "style", "aside")) #, "footer", "header", "nav"
    formatted_output = format_extracted_data(data)

    #formatted_output = "\n".join([f"{key}: {value}" for key, value in data.items() if value])
    #return formatted_output
    return title, meta_description, keywords, formatted_output


def fetch_and_extract_content(url, timeout: float = 10.0):
    """Download ``url``, parse it, and run ``extract_content`` on it.

    Fixes two defects: the request previously had no timeout (and could
    hang forever), and the extracted content was computed but discarded,
    so the function always returned None even on success.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the HTTP response (new, defaulted,
            so existing callers are unaffected).

    Returns:
        The tuple from ``extract_content``, or None on any failure —
        errors are printed rather than raised, preserving the original
        best-effort behavior.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return extract_content(soup, url)
    except Exception as e:
        print(f"Error: {str(e)}")
        return None


#target_url = "https://www.technoduces.com/contact-us" #"https://www.technoduces.com/mobile-application-development"  #"https://www.nextbraintech.com/contact-us"

#fetch_and_extract_content(target_url)
