import os
import glob
import unicodedata
from multiprocessing import Pool, cpu_count
from newspaper import Article
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import nltk
from nltk.tokenize import sent_tokenize
from youtube_transcript_api import YouTubeTranscriptApi
import shutil
import subprocess
import sys

# sent_tokenize needs NLTK's Punkt sentence model; fetch it once if absent
# (newer NLTK releases look for 'punkt_tab' rather than 'punkt').
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

class UnifiedDocumentProcessor:
    def __init__(self):
        self.create_directories()
        # Ensure logs.txt exists
        if not os.path.exists('logs.txt'):
            open('logs.txt', 'a').close()
            print("Created empty logs.txt file")

    def create_directories(self):
        """Create necessary directories if they don't exist"""
        directories = ['documente', 'stiri', 'data_en_split', 'rezultate']
        for directory in directories:
            os.makedirs(directory, exist_ok=True)

    def remove_google_links(self, input_file="logs.txt"):
        """Remove Google links from the input file"""
        try:
            with open(input_file, "r", encoding="utf-8", errors="ignore") as file:
                lines = file.readlines()
            filtered_lines = [line for line in lines if "google" not in line.lower()]
            with open(input_file, "w", encoding="utf-8") as file:
                file.writelines(filtered_lines)
            print(f"Google links removed from '{input_file}'")
        except Exception as e:
            print(f"Error processing links: {e}")

    def download_pdfs(self, logs_file="logs.txt"):
        """Download PDFs from URLs in logs file"""
        try:
            # Requires the wget binary on PATH; per-URL failures are tolerated
            # (--tries=1 and a short timeout keep dead links from stalling).
            subprocess.run(['wget', '--no-check-certificate', '--tries=1',
                            '--timeout=3', '-A', 'pdf', '-i', logs_file])
        except Exception as e:
            print(f"Error downloading PDFs: {e}")

    def process_youtube_links(self, input_file="logs.txt"):
        """Extract YouTube transcripts"""
        try:
            youtube_ids = []
            with open(input_file, encoding='utf-8') as f:
                for line in f:
                    if "youtu.be/" in line:
                        # Short link: the ID is the path segment after youtu.be/
                        video_id = line.split("youtu.be/")[1].strip()
                        youtube_ids.append(video_id.split("?")[0])
                    elif "youtube.com/watch?v=" in line:
                        # Long link: the ID is the v= parameter; drop trailing
                        # parameters such as &t= or &list=.
                        video_id = line.split("youtube.com/watch?v=")[1].strip()
                        youtube_ids.append(video_id.split("&")[0])

            if youtube_ids:
                with open("transcripts.txt", "w", encoding='utf-8') as out_file:
                    for video_id in youtube_ids:
                        try:
                            transcript = YouTubeTranscriptApi.get_transcript(video_id)
                            script = " ".join(t["text"] for t in transcript if t["text"] != '[Music]')
                            out_file.write(script + "\n")
                        except Exception as e:
                            print(f"Could not process video ID {video_id}: {e}")
        except Exception as e:
            print(f"Error processing YouTube links: {e}")

    def download_article(self, article_url, index):
        """Download and extract text from news article"""
        out_file_path = os.path.join('stiri', f"news_{index}.txt")
        try:
            article = Article(article_url)
            article.download()
            article.parse()
            with open(out_file_path, 'w', encoding='utf-8') as fw:
                fw.write(article.text)
            print(f"Downloaded: [{article_url}] [OK]")
        except Exception as e:
            print(f"Error downloading {article_url}: {e}")

    def process_articles(self, urls_file="logs.txt"):
        """Process all articles in parallel"""
        try:
            with open(urls_file, 'r', encoding='utf-8') as fr:
                # Skip blank lines; enumerate keeps the index stable for naming.
                urls = [(line.strip(), i) for i, line in enumerate(fr) if line.strip()]
            
            with Pool(cpu_count()) as pool:
                pool.starmap(self.download_article, urls)
        except Exception as e:
            print(f"Error processing articles: {e}")

    def pdf_to_txt(self, input_pdf, output_txt):
        """Convert PDF to text"""
        outfp = None
        device = None
        try:
            outfp = open(output_txt, 'w', encoding='utf8')
            rsrcmgr = PDFResourceManager(caching=True)
            device = TextConverter(rsrcmgr, outfp, laparams=LAParams())
            
            with open(input_pdf, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, check_extractable=True):
                    interpreter.process_page(page)
        except Exception as e:
            print(f"Error converting PDF {input_pdf}: {e}")
        finally:
            if device:
                device.close()
            if outfp:
                outfp.close()
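
    # pdfminer.six also ships a one-call helper covering this common case; a
    # minimal alternative sketch (assumes a recent pdfminer.six release):
    #
    #   from pdfminer.high_level import extract_text
    #   text = extract_text(input_pdf)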

    def extract_pdf_contents(self):
        """Extract text from all PDFs in documente folder"""
        try:
            pdf_files = glob.glob("documente/*.pdf")
            for pdf_file in pdf_files:
                output_file = os.path.join('stiri', f"{os.path.basename(pdf_file)}.txt")
                self.pdf_to_txt(pdf_file, output_file)
        except Exception as e:
            print(f"Error extracting PDF contents: {e}")

    def split_documents(self, input_dir='stiri', output_dir='data_en_split'):
        """Split documents into smaller chunks"""
        try:
            txt_files = glob.glob(f"{input_dir}/*.txt")
            
            for file_path in txt_files:
                try:
                    with open(file_path, 'rt', encoding='utf8', errors='ignore') as f:
                        content = f.read()
                    sentences = sent_tokenize(content)
                    
                    buffer = ''
                    counter = 0
                    for sentence in sentences:
                        # sent_tokenize drops the separating whitespace, so
                        # re-insert a space between sentences.
                        buffer += sentence + ' '
                        if len(buffer) > 1000:
                            output_path = f'{output_dir}/{os.path.basename(file_path)}.{counter:03d}.txt'
                            with open(output_path, 'wt', encoding="utf-8") as f:
                                f.write(buffer)
                            buffer = ''
                            counter += 1
                    
                    if buffer:
                        output_path = f'{output_dir}/{os.path.basename(file_path)}.{counter:03d}.txt'
                        with open(output_path, 'wt', encoding="utf-8") as f:
                            f.write(buffer)
                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")
        except Exception as e:
            print(f"Error splitting documents: {e}")

    def filter_documents(self, input_dir):
        """Remove empty and duplicate documents"""
        try:
            txt_files = glob.glob(f'{input_dir}/*.txt')
            content_hash = {}
            
            for file_path in txt_files:
                try:
                    with open(file_path, encoding='utf8', errors='ignore') as f:
                        content = f.read()
                    
                    if len(content) == 0 or len(content) > 3000:
                        os.unlink(file_path)
                        continue
                    
                    file_hash = hash(content)
                    if file_hash in seen_hashes:
                        os.unlink(file_path)
                    else:
                        seen_hashes.add(file_hash)
                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")
        except Exception as e:
            print(f"Error filtering documents: {e}")

    def process_keywords(self, folder='data_en_split'):
        """Process documents based on keywords"""
        try:
            def remove_accents(text):
                # NFD separates base letters from combining marks; encoding to
                # ASCII with 'ignore' then drops the marks (e.g. "ș" -> "s").
                text = unicodedata.normalize('NFD', text)
                return text.encode('ascii', 'ignore').decode('ascii')

            while True:
                try:
                    n = int(input("Enter the number of keywords: "))
                    break
                except ValueError:
                    print("Please enter a valid number.")

            keywords = []
            print("Enter the keywords, one per line:")
            for i in range(n):
                keyword = input(f"Keyword {i+1}: ")
                keywords.append(remove_accents(keyword))

            result_path = os.path.join('rezultate', 'result.txt')
            if os.path.exists(result_path):
                os.remove(result_path)

            print(f"\nCaut texte care conțin toate cuvintele cheie: {', '.join(keywords)}")
            found_count = 0

            for file in os.listdir(folder):
                if file.endswith('.txt'):
                    try:
                        with open(os.path.join(folder, file), encoding='utf-8') as f:
                            content = remove_accents(f.read())
                        
                        if all(keyword.lower() in content.lower() for keyword in keywords):
                            with open(result_path, 'a', encoding='utf-8') as f2:
                                f2.write(content + '\n' + '-'*50 + '\n\n')
                            found_count += 1
                    except Exception as e:
                        print(f"Eroare la procesarea fișierului {file}: {e}")

            print(f"\nAm găsit {found_count} texte care conțin toate cuvintele cheie.")
            print(f"Rezultatul a fost salvat în: {os.path.abspath(result_path)}")

            # Open the result file for the user
            try:
                if os.name == 'nt':  # Windows
                    os.startfile(result_path)
                elif sys.platform == 'darwin':  # macOS
                    subprocess.run(['open', result_path])
                else:  # Linux and other POSIX desktops
                    subprocess.run(['xdg-open', result_path])
                print("\nOpened result.txt for reading.")
            except Exception as e:
                print(f"Could not open the result file: {e}")

        except Exception as e:
            print(f"Error processing keywords: {e}")

    def run_pipeline(self):
        """Execute the complete processing pipeline"""
        original_dir = os.getcwd()  # Save the original working directory
        
        try:
            print("Starting document processing pipeline...")
            
            # Step 1: Process logs and download content
            self.remove_google_links()
            
            # Create documente directory if it doesn't exist
            os.makedirs('documente', exist_ok=True)
            
            # Copy logs.txt to documente folder
            shutil.copy2('logs.txt', 'documente/logs.txt')
            print("Copied logs.txt to documente folder")
            
            # Change to documente directory for downloads
            os.chdir('documente')
            self.download_pdfs()
            
            # Return to the original directory for the remaining steps
            os.chdir(original_dir)
            
            self.process_youtube_links()
            self.process_articles()
            
            # Step 2: Process PDFs
            print("\nExtracting text from PDFs...")
            self.extract_pdf_contents()
            
            # Step 3: Split and filter documents
            print("\nSplitting and filtering documents...")
            self.split_documents()
            self.filter_documents('data_en_split')
            
            # Step 4: Process keywords
            print("\nProcessing keywords...")
            self.process_keywords()
            
            print("\nProcessing pipeline completed successfully!")
            
        except Exception as e:
            print(f"\nError in processing pipeline: {str(e)}")
            print("Please check if all required files exist and try again.")
            
        finally:
            # Make sure we return to the original directory even after an error
            if os.getcwd() != original_dir:
                os.chdir(original_dir)
            print("Returned to original directory.")

if __name__ == "__main__":
    processor = UnifiedDocumentProcessor()
    processor.run_pipeline()
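
# Individual steps can also be run in isolation, e.g. to re-query an existing
# crawl without downloading anything (assumes data_en_split/ is populated):
#
#   processor = UnifiedDocumentProcessor()
#   processor.process_keywords()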