import argparse
import glob
import hashlib
import os


def filter_duplicates(text_files):
    """Delete empty files and files whose text duplicates an earlier one.

    Files are processed in the order given. The first file seen with a
    given text is kept; every later file whose decoded text is identical
    is removed from disk. Files whose decoded text is empty are removed as
    well. Each removal is reported on stdout.

    Args:
        text_files: Iterable of paths to text files. Each file is read as
            UTF-8 with undecodable bytes ignored, so two files that differ
            only in invalid byte sequences compare as duplicates.

    Returns:
        None. The effect of the call is the deletion of files on disk.

    Raises:
        OSError: If a file cannot be opened or removed.
    """
    seen_digests = set()
    for file_path in text_files:
        # The context manager closes the handle deterministically, before
        # the file may be unlinked below.
        with open(file_path, encoding='utf8', errors='ignore') as handle:
            content = handle.read()

        if not content:
            print(f'empty: {file_path}')
            os.unlink(file_path)
            continue

        # A cryptographic digest is used instead of the built-in hash():
        # deletion is irreversible, so an accidental hash collision must
        # not be able to mark a distinct file as a duplicate.
        digest = hashlib.sha256(content.encode('utf8')).digest()
        if digest in seen_digests:
            print(f'duplicate: {file_path}')
            os.unlink(file_path)
            continue

        seen_digests.add(digest)


def filter_duplicates_from_directory(target_directory):
    """Remove empty and duplicate ``*.txt`` files found in a directory.

    Only files directly inside ``target_directory`` are considered;
    subdirectories are not searched. The matched paths are handed to
    ``filter_duplicates`` in sorted order, so which copy of a set of
    duplicates is kept is the same on every run.

    Args:
        target_directory: Path of the directory to scan.
    """
    # Escape the directory part so characters such as '[', '*' or '?' in
    # its name are matched literally instead of being read as a pattern.
    directory = glob.escape(os.fspath(target_directory))
    pattern = os.path.join(directory, '*.txt')
    # glob returns matches in arbitrary order; sort for reproducibility.
    text_files = sorted(glob.glob(pattern))
    filter_duplicates(text_files)


# Command-line interface: a single optional flag names the directory to
# clean; when it is omitted the 'stiri' directory is used.
parser = argparse.ArgumentParser(description='Athos article filter')
parser.add_argument('--in_dir', type=str, default='stiri', required=False)
args = parser.parse_args()

# Runs at module level: delete empty and duplicate .txt files in the
# selected directory.
filter_duplicates_from_directory(args.in_dir)