import os
import glob
import hashlib
import shutil
import subprocess
import sys
import unicodedata
from multiprocessing import Pool, cpu_count

import nltk
import pikepdf
from newspaper import Article
from nltk.tokenize import sent_tokenize
from pdfminer.high_level import extract_text
from youtube_transcript_api import YouTubeTranscriptApi, CouldNotRetrieveTranscript

# sent_tokenize requires the NLTK 'punkt' model; fetch it once if missing.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)
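
# Expected input: a 'logs.txt' file in the working directory, one URL per line
# (news articles, direct PDF links, or YouTube videos). Placeholder example:
#   https://example.com/some-article
#   https://example.com/report.pdf
#   https://www.youtube.com/watch?v=VIDEO_ID
# The PDF download step also assumes the 'wget' binary is available on PATH.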

class UnifiedDocumentProcessor:
    def __init__(self):
        self.log_file = 'logs.txt'
        self.create_directories()
        self.ensure_log_file()

    def create_directories(self):
        """Create necessary directories if they don't exist"""
        directories = ['documente', 'stiri', 'data_en_split', 'rezultate']
        for directory in directories:
            os.makedirs(directory, exist_ok=True)

    def ensure_log_file(self):
        """Ensure logs.txt exists"""
        if not os.path.exists(self.log_file):
            open(self.log_file, 'a').close()
            print(f"Created empty {self.log_file} file")

    def remove_google_links(self):
        """Remove Google links from the input file"""
        try:
            with open(self.log_file, "r") as file:
                lines = file.readlines()
            filtered_lines = [line for line in lines if "google" not in line]
            with open(self.log_file, "w") as file:
                file.writelines(filtered_lines)
            print(f"Google links removed from '{self.log_file}'")
        except Exception as e:
            print(f"Error processing links: {e}")

    def download_pdfs(self):
        """Download PDFs from URLs in logs file"""
        try:
            subprocess.run(['wget', '--no-check-certificate', '--tries=1', 
                          '--timeout=3', '-A', 'pdf', '-i', self.log_file], check=True)
            print("PDF download completed successfully")
        except subprocess.CalledProcessError as e:
            print(f"Error downloading PDFs: {e}")

    def process_youtube_links(self):
        """Extract YouTube transcripts and save them to individual files in 'stiri' folder"""
        try:
            youtube_ids = []

            # Ensure the 'stiri' directory exists
            os.makedirs('stiri', exist_ok=True)

            # Read YouTube links from the log file
            with open(self.log_file) as f:
                for line in f:
                    # Handle both short youtu.be links and full watch URLs;
                    # strip query parameters to isolate the video ID.
                    if "youtu.be" in line:
                        video_id = line.split("youtu.be/")[1]
                        youtube_ids.append(video_id.split("?")[0].split("&")[0].strip())
                    elif "youtube.com/watch?v=" in line:
                        video_id = line.split("youtube.com/watch?v=")[1]
                        youtube_ids.append(video_id.split("&")[0].strip())

            if youtube_ids:
                for video_id in youtube_ids:
                    try:
                        # Fetch the transcript
                        transcript = YouTubeTranscriptApi.get_transcript(video_id)
                    
                        # Filter and concatenate transcript text
                        script = " ".join(t["text"] for t in transcript if t["text"] != '[Music]')

                        # Save the transcript to a file
                        output_path = os.path.join('stiri', f'youtube_transcript_{video_id}.txt')
                        with open(output_path, "w", encoding='utf-8') as out_file:
                            out_file.write(script)
                    
                        print(f"Successfully processed YouTube video: {video_id}")
                    except CouldNotRetrieveTranscript:
                        print(f"No transcript available for video ID {video_id}")
                    except Exception as e:
                        print(f"Could not process video ID {video_id}: {e}")
        except Exception as e:
            print(f"Error processing YouTube links: {e}")



    def download_article(self, article_url, index):
        """Download and extract text from news article"""
        out_file_path = os.path.join('stiri', f"news_{index}.txt")
        try:
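            # newspaper's Article must be download()ed before parse();
            # parse() fills in the plain-text body as article.text.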
            article = Article(article_url)
            article.download()
            article.parse()
            with open(out_file_path, 'w', encoding='utf-8') as fw:
                fw.write(article.text)
            print(f"Downloaded: [{article_url}] [OK]")
        except Exception as e:
            print(f"Error downloading {article_url}: {e}")

    def process_articles(self):
        """Process all articles in parallel"""
        try:
            with open(self.log_file, 'r') as fr:
                # Skip blank lines; every remaining URL is attempted as an article.
                urls = [(line.strip(), i) for i, line in enumerate(fr) if line.strip()]

            # One worker per CPU core; starmap unpacks each (url, index)
            # pair into download_article's arguments.
            with Pool(cpu_count()) as pool:
                pool.starmap(self.download_article, urls)
        except Exception as e:
            print(f"Error processing articles: {e}")

    def is_valid_pdf(self, pdf_path):
        """Check if the PDF is valid and accessible"""
        try:
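            # pikepdf parses the full PDF structure, so this also catches
            # truncated downloads and HTML error pages saved under a .pdf name.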
            with pikepdf.open(pdf_path):
                return True
        except Exception:
            return False

    def pdf_to_txt(self, input_pdf, output_txt):
        """Convert PDF to text with improved error handling"""
        try:
            if not self.is_valid_pdf(input_pdf):
                print(f"Warning: {input_pdf} appears to be invalid or corrupted. Skipping...")
                return False

            # pdfminer performs layout analysis and returns the document's text.
            text = extract_text(input_pdf)
            
            if text.strip():
                with open(output_txt, 'w', encoding='utf-8') as f:
                    f.write(text)
                print(f"Successfully extracted text from {input_pdf}")
                return True
            else:
                print(f"Warning: No text could be extracted from {input_pdf}")
                return False

        except Exception as e:
            print(f"Warning: Could not process {input_pdf}: {str(e)}")
            return False

    def extract_pdf_contents(self):
        """Extract text from all PDFs in documente folder with improved error handling"""
        try:
            pdf_files = glob.glob("documente/*.pdf")
            successful_extractions = 0
            failed_extractions = 0

            print(f"\nFound {len(pdf_files)} PDF files to process...")
            
            for pdf_file in pdf_files:
                output_file = os.path.join('stiri', f"{os.path.basename(pdf_file)}.txt")
                if self.pdf_to_txt(pdf_file, output_file):
                    successful_extractions += 1
                else:
                    failed_extractions += 1

            print(f"\nPDF Processing Summary:")
            print(f"Successfully processed: {successful_extractions}")
            print(f"Failed to process: {failed_extractions}")
            print(f"Total PDFs: {len(pdf_files)}")

        except Exception as e:
            print(f"Error in PDF extraction process: {e}")

    def split_documents(self):
        """Split documents into smaller chunks"""
        try:
            txt_files = glob.glob("stiri/*.txt")
            print(f"\nSplitting {len(txt_files)} documents...")

            for file_path in txt_files:
                try:
                    with open(file_path, 'rt', encoding='utf8', errors='ignore') as f:
                        content = f.read()
                    sentences = sent_tokenize(content)

                    # Greedily pack whole sentences into ~1000-character chunks
                    # so no sentence is split across output files.
                    buffer = ''
                    counter = 0
                    for sentence in sentences:
                        buffer += sentence + ' '
                        if len(buffer) > 1000:
                            output_path = f'data_en_split/{os.path.basename(file_path)}.{counter:03d}.txt'
                            with open(output_path, 'wt', encoding="utf-8") as f:
                                f.write(buffer)
                            buffer = ''
                            counter += 1

                    # Flush the remainder so the tail of each document is kept.
                    if buffer:
                        output_path = f'data_en_split/{os.path.basename(file_path)}.{counter:03d}.txt'
                        with open(output_path, 'wt', encoding="utf-8") as f:
                            f.write(buffer)
                        counter += 1

                    print(f"Split {file_path} into {counter} parts")
                except Exception as e:
                    print(f"Error splitting {file_path}: {e}")
                    
        except Exception as e:
            print(f"Error splitting documents: {e}")

    def filter_documents(self):
        """Remove empty and duplicate documents"""
        try:
            txt_files = glob.glob('data_en_split/*.txt')
            initial_count = len(txt_files)
            seen_hashes = set()
            removed_count = 0

            print("\nFiltering documents...")
            for file_path in txt_files:
                try:
                    with open(file_path, encoding='utf8', errors='ignore') as f:
                        content = f.read()

                    # Drop empty chunks and oversized outliers (> 3000 characters).
                    if len(content) == 0 or len(content) > 3000:
                        os.unlink(file_path)
                        removed_count += 1
                        continue

                    # Use a stable, collision-resistant digest for duplicate
                    # detection rather than Python's salted built-in hash().
                    file_hash = hashlib.sha1(content.encode('utf-8')).hexdigest()
                    if file_hash in seen_hashes:
                        os.unlink(file_path)
                        removed_count += 1
                    else:
                        seen_hashes.add(file_hash)
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

            print(f"Removed {removed_count} documents (empty or duplicate)")
            print(f"Remaining documents: {initial_count - removed_count}")

        except Exception as e:
            print(f"Error filtering documents: {e}")

    def process_keywords(self):
        """Process documents based on keywords"""
        try:
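            # NFD-decompose, then drop non-ASCII bytes: this strips diacritics
            # (e.g. 'ă' -> 'a') so keyword matching is accent-insensitive.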
            def remove_accents(text):
                return unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")

            while True:
                try:
                    n = int(input("\nEnter the number of keywords: "))
                    if n <= 0:
                        print("Please enter a positive number.")
                        continue
                    break
                except ValueError:
                    print("Please enter a valid number.")

            # Keep prompting until exactly n non-empty keywords are collected.
            keywords = []
            while len(keywords) < n:
                keyword = input(f"Keyword {len(keywords) + 1}: ").strip()
                if keyword:
                    keywords.append(remove_accents(keyword))
                else:
                    print("A keyword cannot be empty. Please enter it again.")

            result_path = os.path.join('rezultate', 'result.txt')
            if os.path.exists(result_path):
                os.remove(result_path)

            found_count = 0
            processed_count = 0

            print("\nCautare texte...")
            for file in os.listdir('data_en_split'):
                if file.endswith('.txt'):
                    processed_count += 1
                    try:
                        with open(os.path.join('data_en_split', file), encoding='utf-8') as f:
                            content = remove_accents(f.read())

                        # A chunk matches only if it contains every keyword (AND).
                        if all(keyword.lower() in content.lower() for keyword in keywords):
                            with open(result_path, 'a', encoding='utf-8') as f2:
                                f2.write(content + '\n' + '-'*50 + '\n\n')
                            found_count += 1
                    except Exception as e:
                        print(f"Error processing {file}: {e}")

            print(f"\nRezultat cautare:")
            print(f"Texte procesate: {processed_count}")
            print(f"Texte gasite: {found_count}")
            print(f"Rezultatul a fost salvat în: {os.path.abspath(result_path)}")

            # Open the result file with the platform's default viewer
            try:
                if os.name == 'nt':  # Windows
                    os.startfile(result_path)
                elif sys.platform == 'darwin':  # macOS
                    subprocess.run(['open', result_path])
                else:  # Linux (requires xdg-utils)
                    subprocess.run(['xdg-open', result_path])
            except Exception as e:
                print(f"Could not open the result file: {e}")

        except Exception as e:
            print(f"Error processing keywords: {e}")

    def run_pipeline(self):
        """Execute the complete processing pipeline"""
        original_dir = os.getcwd()
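        # wget saves into the current working directory, so step 2 temporarily
        # changes into 'documente'; the finally block restores the original cwd.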
        try:
            print("\nStarting document processing pipeline...")

            print("\nStep 1: Processing input links...")
            self.remove_google_links()
            shutil.copy2(self.log_file, 'documente/logs.txt')
            
            print("\nStep 2: Downloading PDFs...")
            os.chdir('documente')
            self.download_pdfs()

            print("\nStep 3: Processing YouTube links and articles...")
            os.chdir(original_dir)
            self.process_youtube_links()
            self.process_articles()

            print("\nStep 4: Extracting text from PDFs...")
            self.extract_pdf_contents()

            print("\nStep 5: Processing documents...")
            self.split_documents()
            self.filter_documents()

            print("\nStep 6: Keyword processing...")
            self.process_keywords()

            print("\nProcessing pipeline completed successfully!")
            
        except Exception as e:
            print(f"\nError in processing pipeline: {str(e)}")
        finally:
            os.chdir(original_dir)
            print("Returned to original directory.")

if __name__ == "__main__":
    processor = UnifiedDocumentProcessor()
    processor.run_pipeline()
