from newspaper import Article
import sys
import urllib.request
import os
import zipfile
import re
import logging

import filter_duplicates_empty
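# AnaZi: fetch the GDELT events catalog, download the daily CSV zips that are new
# since the last run, extract the article URLs from each CSV and save the text of
# every article (via newspaper) into a folder named after the day's datecode.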


def extractURLs(fileContent):
    # match http/https URLs; keep the original case, since URL paths can be case-sensitive
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', fileContent)
    return urls
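# e.g. extractURLs('see http://example.com/a.html today') -> ['http://example.com/a.html']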


def filter_duplicates_for_current_day(sHome):
    print('Will run deduplication on folder {0}'.format(sHome))
    try:
        filter_duplicates_empty.filter_duplicates_from_directory(sHome)
    except Exception:
        logging.warning('Deduplication failed')

    print('------- Deduplication completed  -------')


logging.basicConfig(format='%(asctime)s : %(message)s')
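# progress messages go through logging.warning so they show up with the default logging level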

print('AnaZi v2.0') #just for the version info
print('')
print(sys.version)
print('')
logging.warning('Downloading the catalog of events...')
urllib.request.urlretrieve( "http://data.gdeltproject.org/events/filesizes", "filesizes")
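# each line of 'filesizes' is expected to look like '<size> <zipname>'; split()[1] below relies on this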

oldLines=[]
oldLen=0
oldFiles='Anazi_oldfiles'
try:
    with open(oldFiles, 'r') as of:
        oldLines=of.readlines()
        oldLen=len(oldLines)
except OSError:
    logging.warning('Cannot open the old catalog. Does not exist?')

lLines=[]
nLen=0
try:
    with open('filesizes', 'r') as f:
        lLines=f.readlines()
        nLen=len(lLines)
except OSError:
    logging.warning('Cannot open "filesizes". No connection?')

nSkipDays = 1 #number of the most recent days to skip; '1' means 'skip the current day'

nDiff=nLen-oldLen
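# nDiff = how many new catalog lines (i.e. new daily zips) appeared since the last run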


while nDiff>nSkipDays: # aha! something added!
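    # lLines[-nDiff] walks the catalog from the oldest unprocessed entry towards the newest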
    s=lLines[-nDiff].strip()
    if s!='':   #skip empty catalog lines
        sFileName=s.split()[1] #just the file name - no sizes
        logging.warning(sFileName)

        logging.warning('Downloading...')
        urllib.request.urlretrieve( "http://data.gdeltproject.org/events/"+sFileName, sFileName)

        logging.warning('Extracting...')
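        #each daily zip is expected to hold a single CSV named like the zip without '.zip'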
        crtZip=zipfile.ZipFile(sFileName, 'r')
        crtZip.extractall()
        crtZip.close()
        os.remove(sFileName)

        logging.warning('Processing the CSV...')
        sCSV=sFileName.replace('.zip','')
        with open(sCSV, 'r', encoding="utf-8") as fCSV:
            fileContent = fCSV.read()
        os.remove(sCSV)

        logging.warning('Processing the URLs...')
        URLs = extractURLs(fileContent)
        #yet another filter based on extensions
        lURLs=[]
        for item in URLs:
            if item.lower().endswith(('.html', '.htm', '.asp', '.aspx', '.php')):
                lURLs.append(item) #valid URL

        lURLs=list(set(lURLs)) #remove duplicates
        lURLs.sort() #to have the same order for the same day; useful for resuming
        logging.warning('We got '+str(len(lURLs))+' URLs')

        logging.warning('Downloading the URLs...')
        #make the home folder for this day
        sHome=os.path.join(os.getcwd(), sCSV.split('.')[0])  #the folder name is the datecode taken from the filename
        sMsg='Trying to create the folder '+sHome
        try:
            os.mkdir(sHome)
            sMsg=sMsg+' [OK]'
        except FileExistsError:
            sMsg=sMsg+' [EXISTS] (we will resume the download)'
        except OSError:
            sMsg=sMsg+' [ERROR]'
        logging.warning(sMsg)
        
        #actually download
        logging.warning('Downloading articles...')
        for i, line in enumerate(lURLs):
            try:
                bOk=True
                sArticle=b''  #kept as bytes; the file is written in binary mode
                sArticleFile=os.path.join(sHome, 'news_'+str(i+1)+'.txt')
                urlNews=line.strip()
                result = '#'+str(i+1)+': [' + urlNews + '] ...'
                if os.path.isfile(sArticleFile):
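                    #resume support: files saved by a previous (interrupted) run are not re-downloaded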
                    result = result + '[EXISTS. SKIP]'
                    bOk=False
                else:
                    article = Article(url=urlNews, fetch_images=False)
                    article.download()
                    article.parse()
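                    #encode the parsed text to UTF-8 bytes for the binary write below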
                    sArticle=article.text.encode('utf-8')
                    
                    result = result + '[OK]'
                    #TODO: also save article.publish_date and article.authors
            except Exception:
                bOk=False
                result = result + '[ERROR]'
                
            if bOk:
                with open(sArticleFile, 'wb') as fw:
                    fw.write(sArticle)
                    
            logging.warning(result)
        #the whole day was processed; record its catalog line in the old files
        try:
            with open(oldFiles, 'a') as of:
                of.write("\n")  #start the new entry on its own line
                of.write(s)
        except OSError:
            logging.warning('Cannot open the old catalog for writing.')

        #dedup the folder we just filled; sHome only exists inside this branch
        filter_duplicates_for_current_day(sHome)

    print('--------------')
    nDiff=nDiff-1

else:
    #the else of a while runs once the loop condition becomes false, i.e. the catalog is fully processed
    logging.warning('Work done!')

