import urllib.request
import re
from datetime import datetime
from multiprocessing import cpu_count, Pool
from newspaper import Article
import sys
import os
import zipfile
import logging
import filter_duplicates_empty

# Function for extracting file names within a given date range
def extract_file_names(start_date, end_date):
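    """Scrape the GDELT events index page and return the names of the daily
    export archives whose dates fall within [start_date, end_date]."""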
    data_url = "http://data.gdeltproject.org/events/index.html"
    try:
        with urllib.request.urlopen(data_url) as response:
            html_content = response.read().decode('utf-8')
    except Exception as e:
        print(f"Error accessing URL: {e}")
        return []

    file_names = re.findall(r'\d{8}\.export\.CSV\.zip', html_content)
    filtered_files = [name for name in file_names if start_date <= datetime.strptime(name[:8], '%Y%m%d') <= end_date]
    return filtered_files

def extractURLs(fileContent):
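    """Return all http/https URLs found in the raw CSV content (lowercased)."""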
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', fileContent.lower())
    return urls

def filter_duplicates_for_current_day():
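    """Remove duplicate and empty article files from the current day's output folder."""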
    print(f'Will run deduplication on folder {sHome}')
    try:
        filter_duplicates_empty.filter_duplicates_from_directory(sHome)
    except Exception as e:
        logging.warning(f'Deduplication failed: {e}')

    print('------- Deduplication completed -------')

def download_article(article_url, index, out_file_path):
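    """Download and parse one article with newspaper, writing its plain text
    to out_file_path. Errors are reported and swallowed so a single failing
    URL does not stop the worker pool."""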
    message = f"Downloading: [{article_url}] .."
    try:
        article_ = Article(article_url)
        article_.download()
        article_.parse()
        # Write the text only after a successful download and parse, so failed
        # URLs do not leave empty files behind that would block future retries.
        with open(out_file_path, 'wb') as fw:
            fw.write(article_.text.encode('utf-8'))
        message = message + '[OK]'
    except Exception as e:
        message = message + f'[ERROR] {e}'
    finally:
        print(message)

def main():
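    """Prompt for a date range, then for each GDELT daily export: download and
    unzip the archive, extract article URLs, fetch the articles in parallel,
    and deduplicate the resulting text files."""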
    global sRoot, sHome
    logging.basicConfig(format='%(asctime)s : %(message)s')

    # Ask the user for the date range
    start_date_str = input("Enter start date (YYYYMMDD): ")
    end_date_str = input("Enter end date (YYYYMMDD): ")

    try:
        start_date = datetime.strptime(start_date_str, '%Y%m%d')
        end_date = datetime.strptime(end_date_str, '%Y%m%d')
    except ValueError:
        print("Invalid date format. Please use YYYYMMDD format.")
        return

    print(f"Processing from {start_date} to {end_date}")

    # Extract the file names for the requested date range
    file_names = extract_file_names(start_date, end_date)

    for sFileName in file_names:
        logging.warning('Processing file: ' + sFileName)
        # Download the daily archive
        try:
            urllib.request.urlretrieve("http://data.gdeltproject.org/events/" + sFileName, sFileName)
        except Exception as e:
            logging.warning(f"Failed to download {sFileName}: {e}")
            continue

        # Extract the archive
        try:
            with zipfile.ZipFile(sFileName, 'r') as zip_ref:
                zip_ref.extractall()
        except Exception as e:
            logging.warning(f"Failed to extract {sFileName}: {e}")
            continue
        finally:
            os.remove(sFileName)

        # Process the extracted CSV file
        sCSV = sFileName.replace('.zip', '')
        try:
            # errors="replace" keeps the day's data usable even if the CSV
            # contains a few bytes that are not valid UTF-8.
            with open(sCSV, 'r', encoding="utf-8", errors="replace") as fCSV:
                fileContent = fCSV.read()
        except Exception as e:
            logging.warning(f"Failed to read {sCSV}: {e}")
            continue
        finally:
            # The CSV may be missing if extraction produced an unexpected name.
            if os.path.exists(sCSV):
                os.remove(sCSV)

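        # Pull article URLs out of the CSV and keep only typical web page
        # extensions, deduplicated and sorted for a stable download order.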
        URLs = extractURLs(fileContent)
        lURLs = [url for url in URLs if url.endswith(('.html', '.htm', '.asp', '.aspx', '.php'))]
        lURLs = list(set(lURLs))
        lURLs.sort()

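        # Create an output folder named after the date part of the CSV file name (e.g. "20230101")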
        sRoot = os.getcwd()
        sHome = os.path.join(sRoot, sCSV.split('.')[0])
        if not os.path.exists(sHome):
            os.mkdir(sHome)

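        # Build (url, index, output path) tuples, skipping articles already downloaded by a previous run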
        params_list = [(url, i, os.path.join(sHome, f'news_{i+1}.txt')) for i, url in enumerate(lURLs) if not os.path.isfile(os.path.join(sHome, f'news_{i+1}.txt'))]

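        # Fetch the articles in parallel, one worker process per CPU core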
        num_cpus = cpu_count()
        with Pool(num_cpus) as pool:
            pool.starmap(download_article, params_list)

        filter_duplicates_for_current_day()

    print('Process completed.')

if __name__ == '__main__':
    main()
