from multiprocessing import cpu_count, Pool

from newspaper import Article
import sys
import urllib.request
import os
import zipfile
import re
import logging

import filter_duplicates_empty
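
# Overall flow (see main()):
#   1. fetch the GDELT "filesizes" catalog and compare it with the locally
#      stored catalog (Anazi_oldfiles) to find newly added days
#   2. for each new day: download and unzip the daily CSV, extract the article
#      URLs, and download the article text in parallel with newspaper
#   3. deduplicate the day's folder via filter_duplicates_empty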

# Globals set in main(): sRoot is the working directory, sHome is the per-day
# output folder ("home") where the downloaded articles are stored.
sRoot = None
sHome = None

def extractURLs(fileContent):
    # Crude URL extractor: finds http/https URLs in the (lower-cased) CSV dump.
    # The raw string avoids invalid-escape warnings on newer Python versions.
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', fileContent.lower())
    return urls


def filter_duplicates_for_current_day():
    print('Will run deduplication on folder {0}'.format(sHome))
    try:
        filter_duplicates_empty.filter_duplicates_from_directory(sHome)
    except Exception:
        logging.warning('Deduplication failed')

    print('------- Deduplication completed  -------')

def download_article(article_url, index, out_file_path):
    message = f"#{index + 1}: Downloading [{article_url}] .."
    try:
        article_ = Article(article_url)
        article_.download()
        article_.parse()
        # Write the file only after a successful parse, so a failed download
        # does not leave an empty file behind that would be skipped on resume.
        with open(out_file_path, 'wb') as fw:
            fw.write(article_.text.encode('utf-8'))
        message = message + '[OK]'
    except Exception:
        message = message + '[ERROR]'
    finally:
        print(message)

def main():
    global sRoot, sHome
    logging.basicConfig(format='%(asctime)s : %(message)s')

    print('AnaZi v2.0') #just for the version info
    print('')
    print(sys.version)
    print('')
    logging.warning('Downloading the catalog of events...')
    urllib.request.urlretrieve( "http://data.gdeltproject.org/events/filesizes", "filesizes")
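    # Each line of "filesizes" pairs a file size with a daily zip file name;
    # lines added at the end since the last run are the new days to process.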

    oldLines=[]
    oldLen=0
    oldFiles='Anazi_oldfiles' #catalog lines already processed in previous runs
    try:
        of=open(oldFiles, 'r')
    except OSError:
        logging.warning('Cannot open the old catalog. Does not exist?')
    else:
        oldLines=of.readlines()
        oldLen=len(oldLines)
        of.close()

    lLines=[]
    nLen=0
    try:
        f=open('filesizes', 'r')
    except OSError:
        logging.warning('Cannot open "filesizes". No connection?')
    else:
        lLines=f.readlines()
        nLen=len(lLines)
        f.close()

    nSkipDays = 1 #This is the number of days CLOSE to TODAY which we skip. '1' means 'we skip the current day'

    nDiff=nLen-oldLen
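    # nDiff = number of catalog lines added since the last run; the loop below
    # walks them oldest-first via lLines[-nDiff], skipping the newest nSkipDays
    # entries (the still-incomplete current day).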

    while nDiff>nSkipDays: # aha! something added!
        s=lLines[-nDiff].strip()
        if s!='':   #extract the CSV from the ZIP
            sFileName=s.split()[1] #just the file name - no sizes
            logging.warning(sFileName)

            logging.warning('Downloading...')
            urllib.request.urlretrieve( "http://data.gdeltproject.org/events/"+sFileName, sFileName)

            logging.warning('Extracting...')
            crtZip=zipfile.ZipFile(sFileName, 'r')
            crtZip.extractall()
            crtZip.close()
            os.remove(sFileName)

            logging.warning('Processing the CSV...')
            sCSV=sFileName.replace('.zip','')
            fCSV = open(sCSV, 'r', encoding="utf-8")
            fileContent = fCSV.read()
            fCSV.close()
            os.remove(sCSV)

            logging.warning('Processing the URLs...')
            URLs = extractURLs(fileContent)
            #yet another filter based on extensions
            lURLs=[]
            for item in URLs:
                if item.endswith(('.html', '.htm', '.asp', '.aspx', '.php')):
                    lURLs.append(item) #valid URL

            lURLs=list(set(lURLs)) #remove duplicates
            lURLs.sort() #to have the same order for the same day; useful for resuming
            logging.warning('We got '+str(len(lURLs))+' URLs')

            logging.warning('Downloading the URLs...')
            #make home
            sRoot=os.getcwd() #on Win is the current path. On Linux it can be somewhere else (shared etc.)
            sHome=os.path.join(sRoot, sCSV.split('.')[0])  #that is the datecode taken from the filename
            try:
                sMsg='Try to create the folder '+sHome
                os.mkdir(sHome)
                sMsg=sMsg+' [OK]'
            except OSError:
                sMsg=sMsg+' [ERROR] (Already exists? If yes, then we will resume the download)'
            logging.warning(sMsg)

            #actually download
            logging.warning('Downloading articles...')
            params_list = []
            for i, line in enumerate(lURLs):
                urlNews=line.strip()
                sArticleFile=sHome+'/news_'+str(i+1)+'.txt'
                if os.path.isfile(sArticleFile):
                    print('#'+str(i+1)+': [' + urlNews + '] ... [EXISTS. SKIP]')
                else:
                    params_list.append((urlNews, i, sArticleFile))

            num_cpus = cpu_count()
            print(f"We have {num_cpus} CPUs. Downloading..")
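            # Parallel download: starmap passes each (url, index, out_file_path)
            # tuple from params_list to download_article, one worker per CPU core.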
            with Pool(num_cpus) as pool:
                pool.starmap(download_article, params_list)

            try:
                of=open(oldFiles, 'a')
            except OSError:
                logging.warning('Cannot open the old catalog. Bad. Cannot write?')
            else:
                of.write("\n")
                of.write(s)
                of.close()

        print('--------------')
        filter_duplicates_for_current_day()
        nDiff=nDiff-1

    else:
        # while-else: runs once the loop has worked through all the new days
        logging.warning('Work done!')

if __name__ == '__main__':
    main()

