import tkinter as tk
from tkinter import ttk, scrolledtext
from datetime import datetime
import threading
import queue
import logging
import sys
import io
import urllib.request
import re
from multiprocessing import cpu_count, Pool
from newspaper import Article, Config
import os
import zipfile
import time

# Shared newspaper configuration; download_article passes it to every Article.
config = Config()
# Send a browser-like User-Agent string with each request.
config.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
# Request timeout (presumably in seconds -- confirm against the newspaper docs).
config.request_timeout = 30
# Turn off newspaper's article memoization.
config.memoize_articles = False

def extract_file_names(
    start_date: datetime,
    end_date: datetime,
    index_url: str = "http://data.gdeltproject.org/events/index.html",
    timeout: float = 30,
) -> list:
    """Return the GDELT export archive names dated within the given range.

    The index page at *index_url* is fetched and scanned for names of the
    form ``YYYYMMDD.export.CSV.zip``.

    Args:
        start_date: First day to include (inclusive).
        end_date: Last day to include (inclusive).
        index_url: URL of the index listing; defaults to the GDELT events index.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The matching file names, each listed once, in the order they first
        appear in the index. An empty list is returned (and the error is
        logged) when the index cannot be fetched or decoded.
    """
    try:
        with urllib.request.urlopen(index_url, timeout=timeout) as response:
            html_content = response.read().decode('utf-8')
    except Exception as e:
        logging.error(f"Eroare la accesarea URL: {e}")
        return []

    file_pattern = r'\d{8}\.export\.CSV\.zip'

    selected = []
    seen = set()
    for name in re.findall(file_pattern, html_content):
        # The same name can occur several times in the page (for example in
        # both a link target and its text); keep only the first occurrence so
        # no archive is processed twice.
        if name in seen:
            continue
        seen.add(name)

        try:
            file_date = datetime.strptime(name[:8], '%Y%m%d')
        except ValueError:
            # Eight digits that do not form a real calendar date: skip them.
            continue

        if start_date <= file_date <= end_date:
            selected.append(name)

    return selected

def extract_urls(file_content: str) -> list:
    """Extract the unique article-like URLs found in *file_content*.

    Only URLs whose text ends in one of ``.html``, ``.htm``, ``.asp``,
    ``.aspx`` or ``.php`` (compared case-insensitively) are kept.

    Args:
        file_content: Text to scan for ``http://`` / ``https://`` URLs.

    Returns:
        The matching URLs with their original letter case, each listed once,
        in order of first appearance.
    """
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    page_suffixes = ('.html', '.htm', '.asp', '.aspx', '.php')

    # Match case-insensitively instead of lowercasing the input: URL paths are
    # case-sensitive, so a lowercased URL may point at a different resource.
    candidates = re.findall(url_pattern, file_content, flags=re.IGNORECASE)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # position of a URL in the result is stable from one run to the next.
    unique_urls = dict.fromkeys(
        url for url in candidates if url.lower().endswith(page_suffixes)
    )
    return list(unique_urls)

def filter_duplicates(directory: str):
    """Delete empty and duplicate ``.txt`` files from *directory*.

    Files are visited in sorted name order. A file is removed when its
    content, stripped of leading/trailing whitespace, is empty or equal to
    the stripped content of a file visited earlier. Files that do not end in
    ``.txt`` are left alone. Errors on individual files are logged and do not
    stop the scan.

    Args:
        directory: Path of the directory to clean up.
    """
    # Local import so this function does not depend on the file's import block.
    import hashlib

    seen_digests = set()

    # Sorted so that the copy which survives is deterministic.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue

        filepath = os.path.join(directory, filename)
        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read().strip()
            # The file is closed before any removal below: deleting a file
            # that is still open fails on Windows.

            if not content:
                os.remove(filepath)
                continue

            # A cryptographic digest instead of the built-in hash(), so two
            # different articles are not mistaken for duplicates.
            digest = hashlib.sha256(content.encode('utf-8')).hexdigest()
            if digest in seen_digests:
                os.remove(filepath)
            else:
                seen_digests.add(digest)
        except Exception as e:
            logging.error(f"Eroare la procesarea {filename}: {e}")

def download_article(article_url: str, index: int, out_file_path: str):
    """Fetch one article and write its title and text to *out_file_path*.

    The article is downloaded and parsed with newspaper using the module-level
    ``config``. On success the file holds the title (when there is one)
    followed by a blank line and the article text, and the function then
    sleeps for one second. On any failure the error is logged and
    *out_file_path* is removed if it exists.

    Args:
        article_url: URL of the article to download.
        index: Position of the URL in the caller's list; not used here.
        out_file_path: Destination path of the text file.
    """
    try:
        logging.info(f"Se descarcă: {article_url}")

        # Download and parse with the shared newspaper configuration.
        page = Article(article_url, config=config)
        page.download()
        page.parse()

        # An article without text is treated as a failed download.
        if not page.text:
            raise ValueError("Nu s-a putut extrage conținut")

        with open(out_file_path, 'w', encoding='utf-8', errors='ignore') as sink:
            if page.title:
                sink.write(page.title + "\n\n")
            else:
                sink.write("")
            sink.write(page.text)

        logging.info(f"Descărcare reușită: {article_url}")
        # Short pause between downloads to go easy on the remote servers.
        time.sleep(1)

    except Exception as err:
        logging.error(f"Eroare la descărcarea {article_url}: {err}")
        # Do not leave a partially written file behind.
        if os.path.exists(out_file_path):
            os.remove(out_file_path)

def process_gdelt_file(filename: str, output_dir: str) -> None:
    """Download one GDELT archive and save the articles it links to.

    The archive is fetched into the current working directory and extracted
    there; the URLs found in the extracted CSV are downloaded into
    *output_dir* as ``news_<n>.txt`` (targets that already exist are skipped),
    and empty or duplicate article files are then removed. The downloaded
    archive and the extracted CSV are deleted before returning. Failures are
    logged rather than raised.

    Args:
        filename: Archive name as listed in the GDELT index
            (``YYYYMMDD.export.CSV.zip``).
        output_dir: Directory that receives the article text files; created
            if missing.
    """
    # Local import so this function does not depend on the file's import block.
    from multiprocessing.pool import ThreadPool

    logging.info(f'Se procesează fișierul: {filename}')

    try:
        urllib.request.urlretrieve(
            f"http://data.gdeltproject.org/events/{filename}",
            filename
        )

        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall()

    except Exception as e:
        logging.error(f"Eroare la procesarea fișierului zip {filename}: {e}")
        return
    finally:
        # The archive is no longer needed, whether extraction worked or not.
        if os.path.exists(filename):
            os.remove(filename)

    # Strip only the trailing ".zip"; str.replace would remove every occurrence.
    if filename.endswith('.zip'):
        csv_filename = filename[:-len('.zip')]
    else:
        csv_filename = filename

    try:
        with open(csv_filename, 'r', encoding="utf-8", errors='ignore') as f:
            urls = extract_urls(f.read())

        os.makedirs(output_dir, exist_ok=True)

        # One (url, index, target path) tuple per article not yet on disk.
        params = []
        for i, url in enumerate(urls):
            target = os.path.join(output_dir, f'news_{i+1}.txt')
            if not os.path.isfile(target):
                params.append((url, i, target))

        if params:
            # Downloads are I/O-bound, so a small thread pool is enough.
            # Threads (unlike worker processes) share this process's logging
            # handlers, so their log lines end up in the same place as the
            # rest of the log, and no child process is started from a thread.
            with ThreadPool(min(cpu_count(), 4)) as pool:
                pool.starmap(download_article, params)

        filter_duplicates(output_dir)

    except Exception as e:
        logging.error(f"Eroare la procesarea fișierului CSV {csv_filename}: {e}")
    finally:
        if os.path.exists(csv_filename):
            os.remove(csv_filename)

def process_files(start_date_str: str, end_date_str: str) -> None:
    """Process every GDELT archive dated between the two given days.

    Both arguments are parsed as ``YYYYMMDD``. For each archive name returned
    by ``extract_file_names`` the articles are written to a directory under
    the current working directory named after the part of the archive name
    before its first dot. Problems (bad date format, no archives found) are
    logged and end the call early.

    Args:
        start_date_str: First day of the range, ``YYYYMMDD``.
        end_date_str: Last day of the range, ``YYYYMMDD``.
    """
    date_format = '%Y%m%d'
    try:
        start_date = datetime.strptime(start_date_str, date_format)
        end_date = datetime.strptime(end_date_str, date_format)
    except ValueError:
        logging.error("Format dată invalid. Folosiți formatul YYYYMMDD.")
        return

    logging.info(f"Se procesează fișierele din perioada {start_date} până la {end_date}")

    archive_names = extract_file_names(start_date, end_date)
    if not archive_names:
        logging.error("Nu s-au găsit fișiere pentru intervalul specificat")
        return

    for archive_name in archive_names:
        # e.g. "20200101.export.CSV.zip" -> "<cwd>/20200101"
        day_label = archive_name.split('.')[0]
        process_gdelt_file(archive_name, os.path.join(os.getcwd(), day_label))

    logging.info('Procesare finalizată cu succes.')

class RedirectText:
    """Minimal writable stream that forwards everything written to a queue.

    It provides the ``write``/``flush`` pair so it can stand in for
    ``sys.stdout`` or ``sys.stderr``.
    """

    def __init__(self, text_widget, queue):
        """Store the target widget and the queue that receives the text.

        Args:
            text_widget: Widget associated with this stream; kept as an
                attribute but not used by this class itself.
            queue: Object with a ``put`` method that receives each string.
        """
        self.text_widget = text_widget
        self.queue = queue

    def write(self, string):
        """Hand *string* over to the queue."""
        self.queue.put(string)

    def flush(self):
        """Do nothing; there is no buffer to flush."""
        return None

class AnaziGUI:
    """Tkinter window for choosing a date range and running the download.

    Processing runs in a background thread. That thread never touches Tk
    directly: log text and the completion signal both travel through
    ``log_queue``, which the Tk main loop polls every 100 ms.
    """

    # Sentinel the worker thread puts on the log queue when it is finished.
    _DONE = object()

    def __init__(self, root):
        """Build the widgets and route stdout, stderr and logging to the log box.

        Args:
            root: The ``tk.Tk`` root window that hosts the interface.
        """
        self.root = root
        self.root.title("ANAZI - GDELT Article Downloader")
        self.root.geometry("800x600")

        # Root grid: the main frame (row 0) absorbs extra space; the status
        # bar (row 1) keeps its natural height at the bottom.
        self.root.grid_columnconfigure(0, weight=1)
        self.root.grid_rowconfigure(0, weight=1)

        # Main frame
        main_frame = ttk.Frame(root, padding="10")
        main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
        # Let the progress frame (row 2) grow with the window.
        main_frame.grid_columnconfigure(0, weight=1)
        main_frame.grid_columnconfigure(1, weight=1)
        main_frame.grid_rowconfigure(2, weight=1)

        # Date range frame
        date_frame = ttk.LabelFrame(main_frame, text="Interval de Date", padding="5")
        date_frame.grid(row=0, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=5)

        # Start date
        ttk.Label(date_frame, text="Data început (YYYYMMDD):").grid(row=0, column=0, padx=5)
        self.start_date = ttk.Entry(date_frame, width=10)
        self.start_date.grid(row=0, column=1, padx=5)

        # End date
        ttk.Label(date_frame, text="Data sfârșit (YYYYMMDD):").grid(row=0, column=2, padx=5)
        self.end_date = ttk.Entry(date_frame, width=10)
        self.end_date.grid(row=0, column=3, padx=5)

        # Processing button
        self.process_btn = ttk.Button(main_frame, text="Începe Procesarea", command=self.start_processing)
        self.process_btn.grid(row=1, column=0, columnspan=2, pady=10)

        # Progress frame
        progress_frame = ttk.LabelFrame(main_frame, text="Progres", padding="5")
        progress_frame.grid(row=2, column=0, columnspan=2, sticky=(tk.W, tk.E, tk.N, tk.S))
        progress_frame.grid_columnconfigure(0, weight=1)
        progress_frame.grid_rowconfigure(0, weight=1)

        # Log text area
        self.log_text = scrolledtext.ScrolledText(progress_frame, wrap=tk.WORD, height=20)
        self.log_text.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))

        # Status bar
        self.status_var = tk.StringVar()
        self.status_var.set("Gata")
        self.status_bar = ttk.Label(root, textvariable=self.status_var, relief=tk.SUNKEN)
        self.status_bar.grid(row=1, column=0, sticky=(tk.W, tk.E))

        # Queue carrying log text (and the completion sentinel) to the main loop
        self.log_queue = queue.Queue()
        self.queue_check()

        # Redirect stdout and stderr into the queue
        sys.stdout = RedirectText(self.log_text, self.log_queue)
        sys.stderr = RedirectText(self.log_text, self.log_queue)

        # Logging writes to the redirected stdout, i.e. into the queue
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[logging.StreamHandler(sys.stdout)]
        )

    def validate_dates(self):
        """Check both entries hold ``YYYYMMDD`` dates with start <= end.

        Surrounding whitespace in the entries is ignored.

        Returns:
            True when the dates are valid; otherwise False, after the reason
            has been written to the log box.
        """
        try:
            start = datetime.strptime(self.start_date.get().strip(), '%Y%m%d')
            end = datetime.strptime(self.end_date.get().strip(), '%Y%m%d')
            if start > end:
                raise ValueError("Data de început trebuie să fie înaintea datei de sfârșit")
        except ValueError as e:
            self.log_text.insert(tk.END, f"Eroare: {str(e)}\n")
            self.log_text.see(tk.END)
            return False
        return True

    def start_processing(self):
        """Validate the input, lock the button and start the worker thread."""
        if not self.validate_dates():
            return

        self.process_btn.state(['disabled'])
        self.status_var.set("Se procesează...")

        process_thread = threading.Thread(
            target=self.process_files_thread,
            # Same whitespace-stripped values that validate_dates accepted.
            args=(self.start_date.get().strip(), self.end_date.get().strip())
        )
        process_thread.daemon = True
        process_thread.start()

    def process_files_thread(self, start_date, end_date):
        """Run ``process_files`` in the worker thread and signal completion.

        ``process_files`` logs its own success or failure, so nothing is
        reported here except an unexpected exception.

        Args:
            start_date: Start of the range as a ``YYYYMMDD`` string.
            end_date: End of the range as a ``YYYYMMDD`` string.
        """
        try:
            process_files(start_date, end_date)
        except Exception as e:
            self.log_queue.put(f"Eroare în timpul procesării: {str(e)}\n")
        finally:
            # Tk must only be used from the main thread, so the main loop is
            # told to reset the GUI via the queue instead of root.after().
            self.log_queue.put(self._DONE)

    def processing_complete(self):
        """Re-enable the button and reset the status bar (main thread only)."""
        self.process_btn.state(['!disabled'])
        self.status_var.set("Gata")

    def queue_check(self):
        """Drain the log queue into the log box, then reschedule in 100 ms."""
        wrote_text = False
        while True:
            try:
                msg = self.log_queue.get_nowait()
            except queue.Empty:
                break

            if msg is self._DONE:
                self.processing_complete()
                continue

            self.log_text.insert(tk.END, msg)
            wrote_text = True

        if wrote_text:
            # Keep the newest output visible.
            self.log_text.see(tk.END)
        self.root.after(100, self.queue_check)

def main():
    """Create the root window, attach the GUI and run the Tk event loop."""
    window = tk.Tk()
    # Keep a reference to the GUI object for as long as the loop runs.
    gui = AnaziGUI(window)
    window.mainloop()

# Start the GUI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()