import os
import re

import pandas as pd
from PyPDF2 import PdfReader

def extract_emails_from_pdfs(file_paths, output_path):
    """
    Extract email addresses from PDF files and save them to an Excel or TXT file.

    Every match is recorded together with the base name of the file and the
    1-based page number it was found on; duplicates are kept. Trailing
    sentence punctuation picked up by the match (``.``, ``)``, ``]``, ...) is
    removed. When the part after ``@`` does not look like a domain (it must
    contain at least one dot), only the local part followed by ``@`` is kept.

    A file that cannot be opened or parsed is reported on stdout and skipped;
    matches already collected from its earlier pages are kept.

    Args:
        file_paths (list): Paths (str or os.PathLike) of the PDF files to process.
        output_path (str): Path where the resulting file will be saved. A
            ``.xlsx`` extension (any letter case) produces an Excel sheet with
            the columns Email, File and Page; any other path produces a UTF-8
            text file with one ``email, file, page`` line per match.
    """
    # Regex pattern to capture email addresses (compiled once, used per page)
    email_re = re.compile(r'[\w\.-]+@[^\s,;:<>"\']*')
    # Pattern to validate domain format (must contain at least one dot)
    domain_re = re.compile(r'^[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
    # Characters the capture pattern lets through but that are only ever
    # sentence punctuation when they end a match, e.g. "(write to a@b.com)."
    trailing_punctuation = '.!?)]}'

    # Collected (email, file name, page number) rows. Building the DataFrame
    # once at the end avoids copying the whole table for every match.
    rows = []

    for file_path in file_paths:
        try:
            file_name = os.path.basename(file_path)
            reader = PdfReader(file_path)
            for page_number, page in enumerate(reader.pages, start=1):
                text = page.extract_text()
                if not text:
                    continue
                for match in email_re.findall(text):  # Keep duplicates
                    # '@' is not in trailing_punctuation, so the strip can
                    # never reach into the local part.
                    email = match.rstrip(trailing_punctuation)
                    local_part, domain = email.split('@', 1)
                    # If domain matches required format, keep full email.
                    # Otherwise, keep only the local part with @.
                    if not domain_re.match(domain):
                        email = local_part + '@'
                    rows.append((email, file_name, page_number))
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole run.
            print(f"Error processing file {file_path}: {e}")

    if os.fspath(output_path).lower().endswith('.xlsx'):
        # Export to Excel
        email_data = pd.DataFrame(rows, columns=['Email', 'File', 'Page'])
        email_data.to_excel(output_path, index=False)
    else:
        # Export to TXT
        with open(output_path, 'w', encoding='utf-8') as f:
            f.writelines(
                f"{email}, {file_name}, {page_number}\n"
                for email, file_name, page_number in rows
            )

    print(f"Extraction complete. Emails saved to {output_path}")

if __name__ == "__main__":
    import sys
    from tkinter import Tk
    from tkinter.filedialog import askopenfilenames, asksaveasfilename

    # Create the Tk root explicitly and withdraw it so that only the
    # dialogs below are shown.
    hidden_root = Tk()
    hidden_root.withdraw()

    # Step 1: pick the PDF files to scan.
    pdf_filter = [("PDF Files", "*.pdf")]
    chosen_pdfs = askopenfilenames(title="Select PDF Files", filetypes=pdf_filter)
    if not chosen_pdfs:
        print("No files selected.")
        sys.exit()

    # Step 2: pick where the results go (Excel by default, plain text optional).
    save_filters = [("Excel File", "*.xlsx"), ("Text File", "*.txt")]
    destination = asksaveasfilename(
        title="Save As",
        defaultextension=".xlsx",
        filetypes=save_filters,
    )
    if not destination:
        print("No output file selected.")
        sys.exit()

    # Step 3: run the extraction with the user's choices.
    extract_emails_from_pdfs(chosen_pdfs, destination)