import asyncio
import aiohttp
from aiohttp import ClientSession, TCPConnector, ClientTimeout
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from bs4 import BeautifulSoup
import re
import os
import logging
from colorama import init, Fore, Style
import gzip
import pickle
from typing import List, Optional, Tuple
from dataclasses import dataclass
import time
from urllib.parse import urlparse
from enum import Enum
import argparse
from tqdm import tqdm
import aiofiles
import tkinter as tk
from tkinter import ttk, scrolledtext, messagebox
import threading
from ttkthemes import ThemedTk
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from matplotlib.figure import Figure
import queue

# Initialize Colorama and Logging
init(autoreset=True)
logging.basicConfig(filename='processing.log', level=logging.DEBUG,
                   format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize VADER sentiment analyzer
nltk.download('vader_lexicon', quiet=True)
sid = SentimentIntensityAnalyzer()

class SearchMode(Enum):
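    """Keyword matching strategy; the values double as the Romanian UI labels ("sau" = OR, "și" = AND)."""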
    OR = "sau"
    AND = "și"

@dataclass
class ProcessedLink:
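    """A crawled page that matched the keywords and exceeded the negative-sentiment threshold."""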
    url: str
    content: str
    sentiment_score: float
    timestamp: str = ""

class WebScraper:
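    """Thin wrapper around an aiohttp session that downloads a page and extracts its visible text."""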
    def __init__(self, session: ClientSession):
        self.session = session

    async def extract_text(self, url: str) -> str:
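        """Fetch *url* and return its text with scripts/styles stripped, or an empty string on failure."""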
        try:
            async with self.session.get(url) as response:
                if response.status == 200:
                    html = await response.text()
                    soup = BeautifulSoup(html, 'html.parser')
                    
                    # Remove script and style elements
                    for script in soup(["script", "style"]):
                        script.decompose()
                    
                    # Get text and clean it
                    text = soup.get_text()
                    lines = (line.strip() for line in text.splitlines())
                    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
                    text = ' '.join(chunk for chunk in chunks if chunk)
                    
                    return text
                return ""
        except Exception as e:
            logging.error(f"Error extracting text from {url}: {e}")
            return ""

class WebCrawler:
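    """Walks Common Crawl datasets and their WET listings, extracts links from each WET file, and
    flags pages whose negative-sentiment score exceeds the configured threshold."""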
    def __init__(self, keywords: List[str], sentiment_threshold: float, search_mode: SearchMode, max_concurrent_requests: int = 10):
        self.keywords = keywords
        self.sentiment_threshold = sentiment_threshold
        self.search_mode = search_mode
        self.session: Optional[ClientSession] = None
        self.rate_limit = asyncio.Semaphore(max_concurrent_requests)
        self.timeout = ClientTimeout(total=30)
        self.scraper = None

    async def download_file(self, url: str, local_filename: str) -> bool:
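        """Download *url* to *local_filename*; return True on success, False otherwise."""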
        print(f"{Fore.BLUE}Se descarcă fișierul de la: {url}{Style.RESET_ALL}")
        try:
            async with self.rate_limit:
                async with self.session.get(url, timeout=self.timeout) as response:
                    response.raise_for_status()
                    async with aiofiles.open(local_filename, 'wb') as f:
                        # Stream the body in chunks so large WET archives are not buffered in memory.
                        async for chunk in response.content.iter_chunked(1 << 20):
                            await f.write(chunk)
            print(f"{Fore.GREEN}Fișierul a fost salvat ca {local_filename}{Style.RESET_ALL}")
            logging.info(f"Fișierul a fost salvat ca {local_filename}")
            return True
        except Exception as e:
            print(f"{Fore.RED}Eroare la descărcarea fișierului {url}: {e}{Style.RESET_ALL}")
            logging.error(f"Eroare la descărcarea fișierului {url}: {e}")
            return False

    @staticmethod
    async def save_extracted_text(link: str, text: str) -> None:
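        """Persist the extracted text for *link* into the extracted_texts/ folder."""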
        result_filename = f"extracted_text_{urlparse(link).netloc}_{int(time.time())}.txt"
        result_folder = "extracted_texts"
        os.makedirs(result_folder, exist_ok=True)
        result_path = os.path.join(result_folder, result_filename)
        try:
            async with aiofiles.open(result_path, 'w', encoding='utf-8') as f:
                await f.write(f"Link: {link}\n\nText extras:\n{text}\n")
            print(f"{Fore.GREEN}Textul extras a fost salvat în {result_path}{Style.RESET_ALL}")
            logging.info(f"Textul extras a fost salvat în {result_path}")
        except Exception as e:
            print(f"{Fore.RED}Eroare la salvarea textului extras în {result_path}: {e}{Style.RESET_ALL}")
            logging.error(f"Eroare la salvarea textului extras în {result_path}: {e}")

    async def extract_text_from_link(self, link: str) -> str:
        try:
            return await self.scraper.extract_text(link)
        except Exception as e:
            logging.error(f"Eroare neașteptată la procesarea linkului {link}: {e}")
            return ""

    @staticmethod
    def analyze_sentiment(text: str) -> float:
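        """Return VADER's negative-polarity component for *text* (0.0 = none, 1.0 = extremely negative)."""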
        sentiment = sid.polarity_scores(text)
        return sentiment['neg']

    def contains_keywords(self, text: str) -> bool:
        """Check the configured keywords against *text* (case-insensitive), honouring the search mode."""
        text_lower = text.lower()
        if self.search_mode == SearchMode.AND:
            return all(keyword.lower() in text_lower for keyword in self.keywords)
        return any(keyword.lower() in text_lower for keyword in self.keywords)

    async def process_link(self, link: str) -> Optional[ProcessedLink]:
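        """Fetch and analyse a single link, retrying up to three times with exponential backoff.

        Returns a ProcessedLink (and saves the extracted text) only when the page has at least 700
        characters of text, contains the keywords, and its negative-sentiment score exceeds the
        configured threshold; otherwise returns None.
        """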
        max_retries = 3
        for attempt in range(max_retries):
            try:
                print(f"{Fore.CYAN}Procesare link: {link} (încercarea {attempt + 1}/{max_retries}){Style.RESET_ALL}")
                async with self.rate_limit:
                    content = await self.extract_text_from_link(link)
                if content and len(content) >= 700:
                    if self.contains_keywords(content):
                        negative_sentiment = self.analyze_sentiment(content)
                        print(f"{Fore.CYAN}Scor sentiment negativ pentru linkul {link}: {negative_sentiment}{Style.RESET_ALL}")
                        
                        text_preview = content[:100].replace('\n', ' ')
                        print(f"{Fore.YELLOW}Preview text: {text_preview}...{Style.RESET_ALL}")

                        if negative_sentiment > self.sentiment_threshold:
                            print(f"{Fore.YELLOW}Sentiment negativ ridicat detectat în linkul: {link}{Style.RESET_ALL}")
                            await self.save_extracted_text(link, content)
                            return ProcessedLink(
                                url=link,
                                content=content,
                                sentiment_score=negative_sentiment,
                                timestamp=time.strftime("%H:%M:%S")
                            )
                        else:
                            print(f"{Fore.GREEN}Nu s-a detectat un sentiment negativ ridicat în linkul: {link}{Style.RESET_ALL}")
                    else:
                        print(f"{Fore.YELLOW}Textul de la linkul {link} nu conține cuvintele cheie căutate.{Style.RESET_ALL}")
                else:
                    print(f"{Fore.YELLOW}Textul de la linkul {link} este insuficient pentru analiză.{Style.RESET_ALL}")
                return None
            except Exception as e:
                print(f"{Fore.RED}Eroare la procesarea linkului {link} (încercarea {attempt + 1}/{max_retries}): {e}{Style.RESET_ALL}")
                logging.error(f"Eroare la procesarea linkului {link} (încercarea {attempt + 1}/{max_retries}): {e}")
                if attempt == max_retries - 1:
                    print(f"{Fore.RED}Abandonarea procesării linkului {link} după {max_retries} încercări{Style.RESET_ALL}")
                else:
                    await asyncio.sleep(2 ** attempt)  # Exponential backoff
        return None

    async def process_links(self, links: List[str]) -> List[ProcessedLink]:
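        """Process all links concurrently (bounded by the semaphore) and keep only the flagged results."""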
        print(f"{Fore.BLUE}Se procesează {len(links)} linkuri...{Style.RESET_ALL}")
        tasks = [self.process_link(link) for link in links]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return [result for result in results if isinstance(result, ProcessedLink)]

    async def get_latest_datasets(self) -> List[str]:
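        """Scrape the Common Crawl index page and return the dataset identifiers listed in its table."""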
        print(f"{Fore.BLUE}Se obțin datele disponibile de la Common Crawl...{Style.RESET_ALL}")
        url = "https://data.commoncrawl.org/crawl-data/index.html"
        try:
            async with self.rate_limit:
                async with self.session.get(url, timeout=self.timeout) as response:
                    response.raise_for_status()
                    content = await response.text()

            soup = BeautifulSoup(content, 'html.parser')
            rows = soup.find_all('tr')
            # Skip the header row and any row without data cells.
            datasets = [cells[0].text.strip() for row in rows[1:] if (cells := row.find_all('td'))]
            print(f"{Fore.GREEN}Numărul de seturi de date disponibile: {len(datasets)}{Style.RESET_ALL}")
            logging.info(f"Numărul de seturi de date disponibile: {len(datasets)}")
            return datasets
        except Exception as e:
            print(f"{Fore.RED}Eroare la obținerea seturilor de date: {e}{Style.RESET_ALL}")
            logging.error(f"Eroare la obținerea seturilor de date: {e}")
            return []

    async def get_wet_file_paths(self, dataset_id: str) -> List[str]:
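        """Download and decompress wet.paths.gz for *dataset_id*, returning the WET file paths it lists."""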
        dataset_url = f"https://data.commoncrawl.org/crawl-data/{dataset_id}/wet.paths.gz"
        local_gz_file = f"{dataset_id}_wet.paths.gz"

        if await self.download_file(dataset_url, local_gz_file):
            try:
                async with aiofiles.open(local_gz_file, 'rb') as f:
                    content = await f.read()
                # gzip.decompress avoids relying on gzip's internal io attribute (io is not imported here).
                wet_file_paths = gzip.decompress(content).decode('utf-8').splitlines()
                os.remove(local_gz_file)
                return wet_file_paths
            except Exception as e:
                print(f"{Fore.RED}Eroare la citirea fișierului {local_gz_file}: {e}{Style.RESET_ALL}")
                logging.error(f"Eroare la citirea fișierului {local_gz_file}: {e}")
                return []
        else:
            return []

    async def download_wet_file(self, wet_file_path: str) -> Optional[str]:
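        """Download a single WET archive and return its local filename, or None if it cannot be obtained."""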
        base_url = "https://data.commoncrawl.org/"
        local_filename = wet_file_path.split('/')[-1]
        full_url = base_url + wet_file_path

        # Reuse a copy that is already on disk (e.g. from an interrupted run) rather than skipping it.
        if os.path.exists(local_filename):
            return local_filename
        if await self.download_file(full_url, local_filename):
            return local_filename
        return None

    async def process_wet_file(self, wet_file_path: str) -> List[str]:
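        """Download a WET archive, pull every http(s) URL out of it, and delete the local copy."""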
        local_wet_file = await self.download_wet_file(wet_file_path)

        if not local_wet_file:
            return []

        try:
            print(f"{Fore.CYAN}Se procesează fișierul WET: {local_wet_file}{Style.RESET_ALL}")

            async with aiofiles.open(local_wet_file, 'rb') as f:
                content = await f.read()
            # Decompress the gzipped WET payload in memory, then scan the text for http(s) URLs.
            text = gzip.decompress(content).decode('utf-8', errors='replace')

            all_links = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)

            print(f"{Fore.GREEN}Extrase {len(all_links)} linkuri din {local_wet_file}{Style.RESET_ALL}")
            logging.info(f"Extrase {len(all_links)} linkuri din {local_wet_file}")

            os.remove(local_wet_file)

            return all_links

        except Exception as e:
            print(f"{Fore.RED}Eroare la procesarea fișierului {local_wet_file}: {e}{Style.RESET_ALL}")
            logging.error(f"Eroare la procesarea fișierului {local_wet_file}: {e}")
            return []

    async def run(self, resume_progress: bool = False) -> None:
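        """Main crawl loop: iterate datasets and their WET files, saving progress after each file."""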
        connector = TCPConnector(limit=100)
        async with ClientSession(connector=connector, timeout=self.timeout) as session:
            self.session = session
            self.scraper = WebScraper(session)
            
            if resume_progress:
                dataset_index, wet_file_index = await self.load_progress()
            else:
                dataset_index, wet_file_index = 0, 0

            datasets = await self.get_latest_datasets()

            try:
                for i in range(dataset_index, len(datasets)):
                    dataset_id = datasets[i]
                    print(f"{Fore.GREEN}Se procesează setul de date: {dataset_id}{Style.RESET_ALL}")

                    wet_file_paths = await self.get_wet_file_paths(dataset_id)

                    for j in tqdm(range(wet_file_index, len(wet_file_paths)), desc="Procesare fișiere WET"):
                        # Track the current position so a KeyboardInterrupt saves an accurate resume point.
                        dataset_index, wet_file_index = i, j
                        wet_file_path = wet_file_paths[j]
                        all_links = await self.process_wet_file(wet_file_path)

                        if all_links:
                            processed_links = await self.process_links(all_links)
                            print(f"{Fore.BLUE}Procesare completă pentru {wet_file_path}, {len(processed_links)} linkuri au fost extrase și analizate.{Style.RESET_ALL}")

                        await self.save_progress(i, j + 1)

                    wet_file_index = 0
                    await self.save_progress(i + 1, wet_file_index)

            except KeyboardInterrupt:
                print(f"{Fore.RED}Execuția a fost întreruptă de utilizator. Se salvează progresul...{Style.RESET_ALL}")
                await self.save_progress(dataset_index, wet_file_index)

    @staticmethod
    async def save_progress(dataset_index: int, wet_file_index: int) -> None:
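        """Persist the (dataset_index, wet_file_index) resume point to progress.pickle."""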
        async with aiofiles.open('progress.pickle', 'wb') as f:
            await f.write(pickle.dumps((dataset_index, wet_file_index)))

    @staticmethod
    async def load_progress() -> Tuple[int, int]:
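        """Return the saved (dataset_index, wet_file_index) resume point, or (0, 0) if none exists."""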
        if os.path.exists('progress.pickle'):
            async with aiofiles.open('progress.pickle', 'rb') as f:
                content = await f.read()
                return pickle.loads(content)
        else:
            return 0, 0

class WebCrawlerGUI:
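    """Tkinter front-end that collects the crawl settings and runs WebCrawler in a background thread."""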
    def __init__(self):
        self.root = ThemedTk(theme="arc")
        self.root.title("Web Crawler & Sentiment Analyzer")
        self.root.geometry("1200x800")
        
        # Initialize variables
        self.running = False
        self.crawler = None
        self.setup_gui()
        
    def setup_gui(self):
        # Create main containers
        self.create_input_frame()
        self.create_output_frame()
        self.create_status_frame()
        
    def create_input_frame(self):
        input_frame = ttk.LabelFrame(self.root, text="Configurare", padding="5")
        input_frame.pack(fill="x", padx=5, pady=5)
        
        # Keywords
        ttk.Label(input_frame, text="Cuvinte cheie (separate prin virgulă):").pack(anchor="w")
        self.keywords_entry = ttk.Entry(input_frame, width=50)
        self.keywords_entry.pack(fill="x", padx=5, pady=2)
        
        # Search mode
        ttk.Label(input_frame, text="Mod de căutare:").pack(anchor="w")
        self.search_mode = ttk.Combobox(input_frame, values=["sau", "și"], state="readonly")
        self.search_mode.set("sau")
        self.search_mode.pack(padx=5, pady=2)
        
        # Sentiment threshold
        ttk.Label(input_frame, text="Prag sentiment negativ:").pack(anchor="w")
        self.threshold_entry = ttk.Entry(input_frame)
        self.threshold_entry.insert(0, "0.107")
        self.threshold_entry.pack(padx=5, pady=2)
        
        # Max concurrent requests
        ttk.Label(input_frame, text="Cereri concurente maxime:").pack(anchor="w")
        self.concurrent_entry = ttk.Entry(input_frame)
        self.concurrent_entry.insert(0, "10")
        self.concurrent_entry.pack(padx=5, pady=2)
        
        # Buttons
        button_frame = ttk.Frame(input_frame)
        button_frame.pack(pady=5)
        
        self.start_button = ttk.Button(button_frame, text="Start", command=self.start_crawling)
        self.start_button.pack(side="left", padx=5)
        
        self.stop_button = ttk.Button(button_frame, text="Stop", command=self.stop_crawling, state="disabled")
        self.stop_button.pack(side="left", padx=5)
        
    def create_output_frame(self):
        output_frame = ttk.LabelFrame(self.root, text="Rezultate", padding="5")
        output_frame.pack(fill="both", expand=True, padx=5, pady=5)
        
        # Create notebook for tabs
        notebook = ttk.Notebook(output_frame)
        notebook.pack(fill="both", expand=True)
        
        # Results tab
        results_tab = ttk.Frame(notebook)
        notebook.add(results_tab, text="Rezultate")
        
        self.results_text = scrolledtext.ScrolledText(results_tab)
        self.results_text.pack(fill="both", expand=True)
        
        # Statistics tab
        stats_tab = ttk.Frame(notebook)
        notebook.add(stats_tab, text="Statistici")
        
        self.stats_text = scrolledtext.ScrolledText(stats_tab)
        self.stats_text.pack(fill="both", expand=True)
        
    def create_status_frame(self):
        status_frame = ttk.Frame(self.root, padding="5")
        status_frame.pack(fill="x", pady=5)
        
        self.progress_var = tk.DoubleVar()
        self.progress_bar = ttk.Progressbar(status_frame, variable=self.progress_var, mode='determinate')
        self.progress_bar.pack(fill="x", padx=5)
        
        self.status_var = tk.StringVar(value="Gata de start")
        status_label = ttk.Label(status_frame, textvariable=self.status_var)
        status_label.pack(pady=2)
        
    def start_crawling(self):
        # Get and validate inputs
        keywords = [k.strip() for k in self.keywords_entry.get().split(',') if k.strip()]
        if not keywords:
            messagebox.showerror("Eroare", "Introduceți cel puțin un cuvânt cheie!")
            return
            
        try:
            threshold = float(self.threshold_entry.get())
            if not 0 <= threshold <= 1:
                raise ValueError
        except ValueError:
            messagebox.showerror("Eroare", "Pragul trebuie să fie între 0 și 1!")
            return
            
        try:
            max_concurrent = int(self.concurrent_entry.get())
            if max_concurrent < 1:
                raise ValueError
        except ValueError:
            messagebox.showerror("Eroare", "Numărul de cereri concurente trebuie să fie pozitiv!")
            return
            
        # Update UI state
        self.running = True
        self.start_button.configure(state="disabled")
        self.stop_button.configure(state="normal")
        self.status_var.set("Se inițializează...")
        
        # Initialize crawler
        search_mode = SearchMode.OR if self.search_mode.get() == "sau" else SearchMode.AND
        self.crawler = WebCrawler(keywords, threshold, search_mode, max_concurrent)
        
        # Start crawler in separate thread
        threading.Thread(target=self.run_crawler, daemon=True).start()
        
    def stop_crawling(self):
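        # Note: this only resets the UI; WebCrawler does not check the running flag yet, so an
        # in-flight crawl keeps going in its background thread until it finishes or errors out.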
        self.running = False
        self.status_var.set("Se oprește...")
        self.stop_button.configure(state="disabled")
        self.start_button.configure(state="normal")
        
    def run_crawler(self):
        """Worker-thread entry point: drives the asyncio crawler and resets the UI when it finishes."""
        try:
            asyncio.run(self.crawler.run())
        except Exception as e:
            logging.error(f"Eroare în timpul crawling-ului: {e}")
            # Tkinter widgets are not thread-safe; route UI updates through the main loop via after().
            self.root.after(0, self.status_var.set, f"Eroare: {e}")
        finally:
            self.running = False
            self.root.after(0, lambda: self.start_button.configure(state="normal"))
            self.root.after(0, lambda: self.stop_button.configure(state="disabled"))
            self.root.after(0, self.status_var.set, "Gata de start")
            
    def update_progress(self, value: float):
        self.progress_var.set(value)
        
    def update_status(self, status: str):
        self.status_var.set(status)
        
    def add_result(self, result: ProcessedLink):
        self.results_text.insert(tk.END, 
            f"\nURL: {result.url}\n"
            f"Scor sentiment: {result.sentiment_score:.3f}\n"
            f"Timestamp: {result.timestamp}\n"
            f"{'-' * 50}\n"
        )
        self.results_text.see(tk.END)
        
    def run(self):
        print_sentiment_explanation()
        self.root.mainloop()

def print_sentiment_explanation():
    print(f"{Fore.YELLOW}Explicație scor de sentiment:{Style.RESET_ALL}")
    print("Scorul de sentiment negativ variază între 0.0 și 1.0:")
    print("- 0.0 reprezintă absența totală a sentimentului negativ")
    print("- 1.0 reprezintă un sentiment extrem de negativ")
    print("Pentru texte normale, scorurile sunt de obicei sub 0.1")
    print("Un scor peste 0.3 poate indica conținut semnificativ negativ")
    print("Valoarea implicită recomandată pentru prag este 0.107\n")

if __name__ == "__main__":
    print(f"{Fore.CYAN}Bun venit la Web Crawler-ul cu analiză de sentiment!{Style.RESET_ALL}")
    print(f"{Fore.YELLOW}Funcționalități:{Style.RESET_ALL}")
    print("1. Căutare conținut web cu cuvinte cheie specificate")
    print("2. Analiză de sentiment a textelor găsite")
    print("3. Identificare conținut potențial negativ")
    print("4. Explorare date din Common Crawl")
    print(f"{Fore.YELLOW}Vă rugăm să utilizați acest script în mod responsabil.{Style.RESET_ALL}\n")
    
    app = WebCrawlerGUI()
    app.run()
