import tkinter as tk
from tkinter import ttk, scrolledtext
from datetime import datetime
import threading
import queue
import logging
import sys
import io
import urllib.request
import re
from multiprocessing import cpu_count, Pool
from newspaper import Article, Config
import os
import zipfile
import time
import shutil

# Configure the newspaper scraper: present a desktop browser user agent,
# allow slow servers up to 30 seconds, and disable article caching.
config = Config()
config.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
config.request_timeout = 30
config.memoize_articles = False

def extract_file_names(start_date: datetime, end_date: datetime) -> list:
    """Extrage numele fișierelor GDELT din intervalul de date specificat."""
    data_url = "http://data.gdeltproject.org/events/index.html"
    try:
        with urllib.request.urlopen(data_url) as response:
            html_content = response.read().decode('utf-8')
    except Exception as e:
        logging.error(f"Eroare la accesarea URL: {e}")
        return []

    # GDELT daily export archives are named YYYYMMDD.export.CSV.zip
    file_pattern = r'\d{8}\.export\.CSV\.zip'
    file_names = re.findall(file_pattern, html_content)
    
    return [
        name for name in file_names 
        if start_date <= datetime.strptime(name[:8], '%Y%m%d') <= end_date
    ]
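
# Illustrative sketch (not executed); the exact names depend on what the
# GDELT index page lists at run time:
#   extract_file_names(datetime(2015, 2, 18), datetime(2015, 2, 19))
#   -> ['20150218.export.CSV.zip', '20150219.export.CSV.zip']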

def extract_urls(file_content: str) -> list:
    """Extrage și validează URL-urile din conținutul fișierului."""
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    urls = re.findall(url_pattern, file_content.lower())
    valid_urls = [
        url for url in urls 
        if url.endswith(('.html', '.htm', '.asp', '.aspx', '.php'))
    ]
    return list(set(valid_urls))
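
# Illustrative sketch (not executed): a row containing
# 'http://example.com/News/Story.html' and 'http://example.com/feed.xml'
# yields only 'http://example.com/news/story.html'; the link is lowercased and
# kept for its .html suffix, while the .xml link fails the extension filter.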

def filter_duplicates(directory: str):
    """Elimină conținutul duplicat din articolele descărcate."""
    files = [f for f in os.listdir(directory) if f.endswith('.txt')]
    content_hash = {}
    
    for filename in files:
        filepath = os.path.join(directory, filename)
        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read().strip()
                if not content:
                    os.remove(filepath)  # drop empty downloads
                    continue
                
                # The built-in hash is stable within a single run, which is
                # all this in-run deduplication needs.
                content_hash_value = hash(content)
                if content_hash_value in content_hash:
                    os.remove(filepath)
                else:
                    content_hash[content_hash_value] = filepath
        except Exception as e:
            logging.error(f"Eroare la procesarea {filename}: {e}")

def filter_by_keywords(source_dir: str, keywords: str):
    """
    Filtrează articolele după cuvinte cheie (post-procesare ca verificare suplimentară).
    Spațiu între cuvinte înseamnă AND, virgulă înseamnă OR.
    """
    if not keywords.strip():
        return
        
    filtered_dir = source_dir + "_filtered"
    os.makedirs(filtered_dir, exist_ok=True)
    
    # Commas separate OR groups; whitespace within a group separates AND terms.
    or_groups = [group.strip() for group in keywords.split(',')]
    keyword_groups = [group.split() for group in or_groups]
    
    logging.info(f"Se aplică filtrarea cu cuvintele cheie în directorul: {source_dir}")
    
    for filename in os.listdir(source_dir):
        if not filename.endswith('.txt'):
            continue
            
        source_path = os.path.join(source_dir, filename)
        try:
            with open(source_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read().lower()
                
            for and_keywords in keyword_groups:
                if all(keyword.lower() in content for keyword in and_keywords):
                    shutil.copy2(source_path, os.path.join(filtered_dir, filename))
                    logging.info(f"Articol filtrat găsit: {filename}")
                    break
                    
        except Exception as e:
            logging.error(f"Eroare la procesarea {filename} pentru filtrare: {e}")

def download_article(article_url: str, index: int, out_file_path: str, keywords: str = ""):
    """Descarcă și salvează conținutul articolului doar dacă conține cuvintele cheie."""
    try:
        logging.info(f"Se descarcă: {article_url}")
        
        article = Article(article_url, config=config)
        article.download()
        article.parse()
        
        if not article.text:
            raise ValueError("Nu s-a putut extrage conținut")
            
        content = article.text.lower()
        if keywords:
            keyword_groups = [group.split() for group in keywords.split(',')]
            if not any(all(keyword.lower() in content for keyword in and_keywords) for and_keywords in keyword_groups):
                logging.info(f"Articol ignorat (nu conține cuvinte cheie): {article_url}")
                return
                
        with open(out_file_path, 'w', encoding='utf-8', errors='ignore') as fw:
            if article.title:
                fw.write(article.title + "\n\n")
            fw.write(article.text)
            
        logging.info(f"Download succeeded: {article_url}")
        time.sleep(1)  # brief pause to be polite to the source servers
            
    except Exception as e:
        logging.error(f"Error downloading {article_url}: {e}")
        # Remove any partially written file so it does not pollute the output.
        if os.path.exists(out_file_path):
            os.remove(out_file_path)
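
# Illustrative sketch (not executed); the URL and output path are made up:
#   download_article('https://example.com/story.html', 0, 'news_1.txt',
#                    keywords='romania moldova')
# saves 'news_1.txt' only if the article body contains both keywords.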

def process_gdelt_file(filename: str, output_dir: str, keywords: str = "") -> None:
    """Procesează un singur fișier GDELT, aplicând filtrarea inițială."""
    logging.info(f'Se procesează fișierul: {filename}')
    
    try:
        urllib.request.urlretrieve(
            f"http://data.gdeltproject.org/events/{filename}", 
            filename
        )
        
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall()
            
    except Exception as e:
        logging.error(f"Eroare la procesarea fișierului zip {filename}: {e}")
        return
    finally:
        # Delete the archive whether or not extraction succeeded.
        if os.path.exists(filename):
            os.remove(filename)

    csv_filename = filename.replace('.zip', '')
    try:
        with open(csv_filename, 'r', encoding="utf-8", errors='ignore') as f:
            content = f.read()
            urls = extract_urls(content)
            
        os.makedirs(output_dir, exist_ok=True)
            
        # Skip URLs whose output file already exists so a rerun can resume.
        params = [
            (url, i, os.path.join(output_dir, f'news_{i+1}.txt'), keywords)
            for i, url in enumerate(urls)
            if not os.path.isfile(os.path.join(output_dir, f'news_{i+1}.txt'))
        ]
        
        # Download in parallel across up to four worker processes.
        with Pool(min(cpu_count(), 4)) as pool:
            pool.starmap(download_article, params)
            
        filter_duplicates(output_dir)
        
    except Exception as e:
        logging.error(f"Eroare la procesarea fișierului CSV {csv_filename}: {e}")
    finally:
        if os.path.exists(csv_filename):
            os.remove(csv_filename)

def process_files(start_date_str: str, end_date_str: str, keywords: str = "") -> None:
    """Funcția principală pentru procesarea fișierelor GDELT."""
    try:
        start_date = datetime.strptime(start_date_str, '%Y%m%d')
        end_date = datetime.strptime(end_date_str, '%Y%m%d')
    except ValueError:
        logging.error("Format dată invalid. Folosiți formatul YYYYMMDD.")
        return

    logging.info(f"Se procesează fișierele din perioada {start_date} până la {end_date}")
    
    file_names = extract_file_names(start_date, end_date)
    if not file_names:
        logging.error("Nu s-au găsit fișiere pentru intervalul specificat")
        return
        
    for filename in file_names:
        # Each file's articles go into a directory named after its date stamp
        # (e.g. '20150218' for '20150218.export.CSV.zip').
        output_dir = os.path.join(os.getcwd(), filename.split('.')[0])
        process_gdelt_file(filename, output_dir, keywords)
    
    logging.info('Processing finished successfully.')
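
# Illustrative sketch (not executed): the pipeline can also be driven without
# the GUI, e.g. from a separate script; the dates here are made-up examples:
#
#   if __name__ == "__main__":
#       logging.basicConfig(level=logging.INFO)
#       process_files('20150218', '20150219', keywords='romania, moldova')
#
# The __main__ guard matters because process_gdelt_file starts a
# multiprocessing Pool, which may re-import this module in worker processes.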

class RedirectText:
    """File-like object that forwards writes to a queue drained by the GUI."""

    def __init__(self, text_widget, queue):
        self.queue = queue
        self.text_widget = text_widget

    def write(self, string):
        self.queue.put(string)

    def flush(self):
        pass

class AnaziGUI:
    def __init__(self, root):
        self.root = root
        self.root.title("ANAZI - GDELT Article Downloader")
        self.root.geometry("800x600")
        
        self.root.grid_columnconfigure(0, weight=1)
        self.root.grid_rowconfigure(2, weight=1)
        
        main_frame = ttk.Frame(root, padding="10")
        main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
        
        date_frame = ttk.LabelFrame(main_frame, text="Date Range", padding="5")
        date_frame.grid(row=0, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=5)
        
        ttk.Label(date_frame, text="Data început (YYYYMMDD):").grid(row=0, column=0, padx=5)
        self.start_date = ttk.Entry(date_frame, width=10)
        self.start_date.grid(row=0, column=1, padx=5)
        
        ttk.Label(date_frame, text="Data sfârșit (YYYYMMDD):").grid(row=0, column=2, padx=5)
        self.end_date = ttk.Entry(date_frame, width=10)
        self.end_date.grid(row=0, column=3, padx=5)
        
        keyword_frame = ttk.LabelFrame(main_frame, text="Keyword Filtering", padding="5")
        keyword_frame.grid(row=1, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=5)
        
        ttk.Label(keyword_frame, 
                 text="Syntax: words separated by spaces = AND, separated by commas = OR\n" + 
                      "Example: 'romania moldova' finds texts containing both words\n" +
                      "'romania, moldova' finds texts containing either word").grid(
            row=0, column=0, columnspan=2, padx=5, pady=5)
        
        ttk.Label(keyword_frame, text="Cuvinte cheie:").grid(row=1, column=0, padx=5)
        self.keywords = ttk.Entry(keyword_frame, width=50)
        self.keywords.grid(row=1, column=1, padx=5)
        
        self.process_btn = ttk.Button(main_frame, text="Start Processing", command=self.start_processing)
        self.process_btn.grid(row=2, column=0, columnspan=2, pady=10)
        
        progress_frame = ttk.LabelFrame(main_frame, text="Progress", padding="5")
        progress_frame.grid(row=3, column=0, columnspan=2, sticky=(tk.W, tk.E, tk.N, tk.S))
        progress_frame.grid_columnconfigure(0, weight=1)
        progress_frame.grid_rowconfigure(0, weight=1)
        
        self.log_text = scrolledtext.ScrolledText(progress_frame, wrap=tk.WORD, height=20)
        self.log_text.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
        
        self.status_var = tk.StringVar()
        self.status_var.set("Gata")
        self.status_bar = ttk.Label(root, textvariable=self.status_var, relief=tk.SUNKEN)
        self.status_bar.grid(row=1, column=0, sticky=(tk.W, tk.E))
        
        self.log_queue = queue.Queue()
        self.queue_check()
        
        # Route stdout/stderr (and therefore logging output) through the queue
        # so background threads never touch the Tk widget directly.
        sys.stdout = RedirectText(self.log_text, self.log_queue)
        sys.stderr = RedirectText(self.log_text, self.log_queue)
        
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[logging.StreamHandler(sys.stdout)]
        )

    def validate_dates(self):
        try:
            start = datetime.strptime(self.start_date.get(), '%Y%m%d')
            end = datetime.strptime(self.end_date.get(), '%Y%m%d')
            if start > end:
                raise ValueError("Data de început trebuie să fie înaintea datei de sfârșit")
            return True
        except ValueError as e:
            self.log_text.insert(tk.END, f"Eroare: {str(e)}\n")
            return False

    def start_processing(self):
        if not self.validate_dates():
            return
            
        self.process_btn.state(['disabled'])
        self.status_var.set("Se procesează...")
        
        process_thread = threading.Thread(
            target=self.process_files_thread,
            args=(self.start_date.get(), self.end_date.get(), self.keywords.get().strip())
        )
        process_thread.daemon = True
        process_thread.start()

    def process_files_thread(self, start_date, end_date, keywords):
        try:
            process_files(start_date, end_date, keywords)
            
            # Apply the extra keyword filter (post-processing)
            if keywords:
                base_dir = os.getcwd()
                for dirname in os.listdir(base_dir):
                    # Output directories are named by date stamp (YYYYMMDD,
                    # see process_files), not '<date>.export.CSV'.
                    if re.fullmatch(r'\d{8}', dirname):
                        dir_path = os.path.join(base_dir, dirname)
                        if os.path.isdir(dir_path):
                            logging.info(f"Applying the extra keyword filter to directory: {dirname}")
                            filter_by_keywords(dir_path, keywords)
                
            self.log_queue.put("Procesare finalizată cu succes.\n")
        except Exception as e:
            self.log_queue.put(f"Eroare în timpul procesării: {str(e)}\n")
        finally:
            self.root.after(0, self.processing_complete)

    def processing_complete(self):
        self.process_btn.state(['!disabled'])
        self.status_var.set("Gata")

    def queue_check(self):
        # Drain pending log messages, then reschedule on the Tk event loop.
        while True:
            try:
                msg = self.log_queue.get_nowait()
                self.log_text.insert(tk.END, msg)
                self.log_text.see(tk.END)
            except queue.Empty:
                break
        self.root.after(100, self.queue_check)

def main():
    root = tk.Tk()
    app = AnaziGUI(root)
    root.mainloop()

if __name__ == "__main__":
    main()
