import os
import glob
import unicodedata
from multiprocessing import Pool, cpu_count
from newspaper import Article
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from nltk.tokenize import sent_tokenize
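# NOTE: sent_tokenize requires the NLTK 'punkt' tokenizer data
# (install once with: python -m nltk.downloader punkt).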
from youtube_transcript_api import YouTubeTranscriptApi
import shutil
import subprocess

class UnifiedDocumentProcessor:
    def __init__(self):
        self.log_file = 'logs.txt'
        self.create_directories()
        self.ensure_log_file()

    def create_directories(self):
        """Create necessary directories if they don't exist"""
        directories = ['documente', 'stiri', 'data_en_split', 'rezultate']
        for directory in directories:
            os.makedirs(directory, exist_ok=True)

    def ensure_log_file(self):
        """Ensure logs.txt exists"""
        if not os.path.exists(self.log_file):
            open(self.log_file, 'a').close()
            print(f"Created empty {self.log_file} file")

    def remove_google_links(self):
        """Remove Google links from the input file"""
        try:
            with open(self.log_file, "r") as file:
                lines = file.readlines()
            filtered_lines = [line for line in lines if "google" not in line]
            with open(self.log_file, "w") as file:
                file.writelines(filtered_lines)
            print(f"Google links removed from '{self.log_file}'")
        except Exception as e:
            print(f"Error processing links: {e}")

    def download_pdfs(self):
        """Download PDFs from URLs in logs file"""
        try:
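            # wget reads the URLs from the log file (-i); -A pdf is intended to keep
            # only PDF files, and the short timeout/single retry keep failures brief.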
            subprocess.run(['wget', '--no-check-certificate', '--tries=1', 
                          '--timeout=3', '-A', 'pdf', '-i', self.log_file], check=True)
        except subprocess.CalledProcessError as e:
            print(f"Error downloading PDFs: {e}")
        except FileNotFoundError:
            print("wget was not found on PATH; skipping PDF download.")

    def process_youtube_links(self):
        """Extract YouTube transcripts"""
        try:
            youtube_ids = []
            with open(self.log_file) as f:
                for line in f:
                    if "youtu.be" in line:
                        youtube_ids.append(line.split("youtu.be/")[1].strip())
                    elif "youtube.com" in line:
                        youtube_ids.append(line.split("youtube.com/watch?v=")[1].strip())

            if youtube_ids:
                with open("transcripts.txt", "w", encoding='utf-8') as out_file:
                    for video_id in youtube_ids:
                        try:
                            transcript = YouTubeTranscriptApi.get_transcript(video_id)
                            script = " ".join(t["text"] for t in transcript if t["text"] != '[Music]')
                            out_file.write(script + "\n")
                        except Exception as e:
                            print(f"Could not process video ID {video_id}: {e}")
        except Exception as e:
            print(f"Error processing YouTube links: {e}")

    def download_article(self, article_url, index):
        """Download and extract text from news article"""
        out_file_path = os.path.join('stiri', f"news_{index}.txt")
        try:
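            # newspaper3k fetches the page and parses out the main article body (.text).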
            article = Article(article_url)
            article.download()
            article.parse()
            with open(out_file_path, 'w', encoding='utf-8') as fw:
                fw.write(article.text)
            print(f"Downloaded: [{article_url}] [OK]")
        except Exception as e:
            print(f"Error downloading {article_url}: {e}")

    def process_articles(self):
        """Process all articles in parallel"""
        try:
            with open(self.log_file, 'r') as fr:
                urls = [(line.strip(), i) for i, line in enumerate(fr) if line.strip()]

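            # Download the articles in parallel, one worker per CPU core.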
            with Pool(cpu_count()) as pool:
                pool.starmap(self.download_article, urls)
        except Exception as e:
            print(f"Error processing articles: {e}")

    def pdf_to_txt(self, input_pdf, output_txt):
        """Convert PDF to text"""
        try:
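            # pdfminer pipeline: resource manager -> TextConverter (writes to outfp) -> page interpreter.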
            with open(output_txt, 'w', encoding='utf8') as outfp, open(input_pdf, 'rb') as fp:
                rsrcmgr = PDFResourceManager(caching=True)
                device = TextConverter(rsrcmgr, outfp, laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, check_extractable=True):
                    interpreter.process_page(page)
                device.close()
        except Exception as e:
            print(f"Error converting PDF {input_pdf}: {e}")

    def extract_pdf_contents(self):
        """Extract text from all PDFs in documente folder"""
        try:
            pdf_files = glob.glob("documente/*.pdf")
            for pdf_file in pdf_files:
                output_file = os.path.join('stiri', f"{os.path.basename(pdf_file)}.txt")
                self.pdf_to_txt(pdf_file, output_file)
        except Exception as e:
            print(f"Error extracting PDF contents: {e}")

    def split_documents(self):
        """Split documents into smaller chunks"""
        try:
            txt_files = glob.glob("stiri/*.txt")

            for file_path in txt_files:
                with open(file_path, 'rt', encoding='utf8', errors='ignore') as f:
                    content = f.read()
                sentences = sent_tokenize(content)

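                # Accumulate sentences into chunks of roughly 1000 characters each.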
                buffer = ''
                counter = 0
                for sentence in sentences:
                    buffer += sentence + ' '  # keep sentences separated by a space
                    if len(buffer) > 1000:
                        output_path = f'data_en_split/{os.path.basename(file_path)}.{counter:03d}.txt'
                        with open(output_path, 'wt', encoding="utf-8") as f:
                            f.write(buffer)
                        buffer = ''
                        counter += 1

                if buffer:
                    output_path = f'data_en_split/{os.path.basename(file_path)}.{counter:03d}.txt'
                    with open(output_path, 'wt', encoding="utf-8") as f:
                        f.write(buffer)
        except Exception as e:
            print(f"Error splitting documents: {e}")

    def filter_documents(self):
        """Remove empty and duplicate documents"""
        try:
            txt_files = glob.glob('data_en_split/*.txt')
            seen_hashes = set()

            for file_path in txt_files:
                with open(file_path, encoding='utf8', errors='ignore') as f:
                    content = f.read()

                # Discard empty chunks and anything longer than 3000 characters.
                if len(content) == 0 or len(content) > 3000:
                    os.unlink(file_path)
                    continue

                # Drop exact duplicate chunks by hashing their contents.
                file_hash = hash(content)
                if file_hash in seen_hashes:
                    os.unlink(file_path)
                else:
                    seen_hashes.add(file_hash)
        except Exception as e:
            print(f"Error filtering documents: {e}")

    def process_keywords(self):
        """Process documents based on keywords"""
        try:
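            # Strip diacritics (NFD-decompose, drop non-ASCII) so matching is accent-insensitive.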
            def remove_accents(text):
                return unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")

            while True:
                try:
                    n = int(input("Enter the number of keywords: "))
                    break
                except ValueError:
                    print("Please enter a valid number.")

            keywords = [remove_accents(input(f"Keyword {i+1}: ")) for i in range(n)]

            result_path = os.path.join('rezultate', 'result.txt')
            if os.path.exists(result_path):
                os.remove(result_path)

            found_count = 0

            for file in os.listdir('data_en_split'):
                if file.endswith('.txt'):
                    with open(os.path.join('data_en_split', file), encoding='utf-8') as f:
                        content = remove_accents(f.read())

                    if all(keyword.lower() in content.lower() for keyword in keywords):
                        with open(result_path, 'a', encoding='utf-8') as f2:
                            f2.write(content + '\n' + '-'*50 + '\n\n')
                        found_count += 1

            print(f"\nAm găsit {found_count} texte care conțin toate cuvintele cheie.")
            print(f"Rezultatul a fost salvat în: {os.path.abspath(result_path)}")

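            # Open the result with the system's default viewer.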
            if os.name == 'nt':
                os.startfile(result_path)
            else:
                subprocess.run(['xdg-open', result_path])
        except Exception as e:
            print(f"Error processing keywords: {e}")

    def run_pipeline(self):
        """Execute the complete processing pipeline"""
        original_dir = os.getcwd()
        try:
            print("Starting document processing pipeline...")

            self.remove_google_links()
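            # Copy the log into 'documente' and run wget from there so downloaded PDFs land in that folder.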
            shutil.copy2(self.log_file, 'documente/logs.txt')
            os.chdir('documente')
            self.download_pdfs()

            os.chdir(original_dir)
            self.process_youtube_links()
            self.process_articles()

            print("\nExtracting text from PDFs...")
            self.extract_pdf_contents()

            print("\nSplitting and filtering documents...")
            self.split_documents()
            self.filter_documents()

            print("\nProcessing keywords...")
            self.process_keywords()

            print("\nProcessing pipeline completed successfully!")
        except Exception as e:
            print(f"\nError in processing pipeline: {str(e)}")
        finally:
            os.chdir(original_dir)
            print("Returned to original directory.")

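# Expected input: a logs.txt file in the working directory with one URL per line
# (news articles, direct PDF links, and YouTube links are all handled).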
if __name__ == "__main__":
    processor = UnifiedDocumentProcessor()
    processor.run_pipeline()
