"""Scrape a National Security Archive page for links to document pages,
collect the PDF links they contain, log them to logs.txt, and download them."""

import requests
from bs4 import BeautifulSoup
import re
import os
from urllib.parse import urljoin
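# Third-party dependencies: requests and beautifulsoup4
# (e.g. pip install requests beautifulsoup4)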

def make_absolute_url(base_url, relative_url):
    """Resolve relative_url against base_url."""
    # urljoin handles absolute URLs, root-relative paths, and '../' segments,
    # all of which naive string concatenation gets wrong.
    return urljoin(base_url, relative_url)

def get_pdf_links_from_intermediate_page(url):
    """Return absolute URLs of all links to PDF files found on the page."""
    try:
        page = requests.get(url, timeout=30)
        page.raise_for_status()  # raise on HTTP error statuses (4xx/5xx)
        soup = BeautifulSoup(page.content, 'html.parser')
        pdf_links = soup.find_all('a', href=re.compile(r'\.pdf$', re.IGNORECASE))
        # Resolve relative hrefs against the page they were found on.
        return [make_absolute_url(url, link['href']) for link in pdf_links]
    except requests.RequestException as e:
        print(f"Error accessing URL {url}: {e}")
        return []

def get_intermediate_page_links(main_url):
    """Return hrefs on the main page that point to document pages."""
    try:
        page = requests.get(main_url, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        intermediate_links = soup.find_all('a', href=True)
        return [link['href'] for link in intermediate_links if 'document' in link['href']]
    except requests.RequestException as e:
        print(f"Error accessing the main URL {main_url}: {e}")
        return []

def download_pdfs(pdf_links, folder_name):
    """Download each PDF into folder_name, named after the URL's last segment."""
    os.makedirs(folder_name, exist_ok=True)

    for url in pdf_links:
        try:
            response = requests.get(url, timeout=60)
            response.raise_for_status()
            filename = url.split('/')[-1]
            filepath = os.path.join(folder_name, filename)
            with open(filepath, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded: {filename}")
        except requests.RequestException as e:
            print(f"Error downloading file {url}: {e}")

def main():
    main_url = input("Enter the main page URL: ").strip()
    base_url = 'https://nsarchive.gwu.edu'
    intermediate_page_links = get_intermediate_page_links(main_url)

    all_pdf_links = []
    for link in intermediate_page_links:
        full_link = make_absolute_url(base_url, link)
        all_pdf_links.extend(get_pdf_links_from_intermediate_page(full_link))

    # Drop duplicate links while preserving discovery order.
    all_pdf_links = list(dict.fromkeys(all_pdf_links))

    if all_pdf_links:
        with open('logs.txt', 'w') as file:
            for link in all_pdf_links:
                file.write(link + '\n')
        print("PDF links saved to logs.txt")
        download_pdfs(all_pdf_links, 'documente_gov')
    else:
        print("No PDF links were found to save.")

if __name__ == "__main__":
    main()
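# Usage note: run the script and enter an archive page URL when prompted.
# Only links whose href contains 'document' are followed (the filter in
# get_intermediate_page_links), and only hrefs ending in '.pdf' on those
# pages are downloaded.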
