import requests
import gzip
import os
import re
import cv2
import shutil
import mediapipe as mp
import json
import time
import tempfile
import webbrowser
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import numpy as np

# MediaPipe setup for face detection
mp_face_detection = mp.solutions.face_detection

# Constants
MAX_RETRIES = 3
CHUNK_SIZE = 10  # Number of WET files to process in one batch
PROGRESS_FILE = 'progress.json'
BASE_IMAGE_FOLDER = 'imgbaza'
SIMILARITY_THRESHOLD = 0.8  # Adjust this value as needed

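# Remove a file, retrying when the OS still holds a lock on it (common on Windows)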
def safe_delete(filename, max_retries=5, delay=1):
    for i in range(max_retries):
        try:
            os.unlink(filename)
            return
        except FileNotFoundError:
            # Already deleted or moved elsewhere; nothing left to clean up
            return
        except PermissionError:
            if i < max_retries - 1:
                time.sleep(delay)
            else:
                print(f"Warning: Could not delete temporary file {filename}")

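# Stream a URL to a local file with a tqdm progress bar, retrying up to MAX_RETRIES times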
def download_file(url, local_filename, retries=MAX_RETRIES):
    for attempt in range(retries):
        try:
            with requests.get(url, stream=True, timeout=30) as r:
                r.raise_for_status()
                total_size = int(r.headers.get('content-length', 0))
                block_size = 8192
                with open(local_filename, 'wb') as f, tqdm(
                    desc=f"Downloading {os.path.basename(local_filename)}",
                    total=total_size,
                    unit='iB',
                    unit_scale=True,
                    unit_divisor=1024,
                ) as progress_bar:
                    for data in r.iter_content(block_size):
                        size = f.write(data)
                        progress_bar.update(size)
            return True
        except Exception as e:
            print(f"Error downloading file {url}: {e}. Attempt {attempt + 1} of {retries}")
    return False

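# Scrape the Common Crawl index page, pick a crawl, and return the paths listed in its wet.paths.gz manifest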
def get_wet_file_paths():
    print("Fetching list of WET files...")
    url = "https://data.commoncrawl.org/crawl-data/index.html"
    response = requests.get(url)
    response.raise_for_status()
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    datasets = {}
    rows = soup.find_all('tr')
    for row in rows:
        columns = row.find_all('td')
        if len(columns) == 4:
            dataset_id = columns[0].text.strip()
            datasets[dataset_id] = True

    if not datasets:
        print("No crawl datasets found on the index page.")
        return []

    # Use the first crawl listed on the index page
    selected_id = next(iter(datasets))
    print(f"Selected dataset: {selected_id}")
    
    dataset_url = f"https://data.commoncrawl.org/crawl-data/{selected_id}/wet.paths.gz"
    
    with tempfile.NamedTemporaryFile(delete=False, suffix='.gz') as temp_file:
        if download_file(dataset_url, temp_file.name):
            with gzip.open(temp_file.name, 'rt') as f:
                wet_file_paths = f.read().splitlines()
            print(f"Found {len(wet_file_paths)} WET files.")
            safe_delete(temp_file.name)
            return wet_file_paths
    
    print("Failed to download WET file paths.")
    return []

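# Pull every http(s) URL ending in .jpg out of a line of WET text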
def extract_jpg_links(text):
    pattern = r'https?://\S+\.jpg'
    return re.findall(pattern, text)

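# Download a single WET archive, scan it line by line for JPG links, and clean up the temporary file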
def process_wet_file(wet_file_path):
    print(f"\nProcessing WET file: {wet_file_path}")
    url = f"https://data.commoncrawl.org/{wet_file_path}"
    
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wet.gz') as temp_file:
        if download_file(url, temp_file.name):
            jpg_links = []
            try:
                # WET archives are gzip-compressed text; tolerate occasional invalid UTF-8 bytes
                with gzip.open(temp_file.name, 'rt', encoding='utf-8', errors='replace') as f:
                    for line in tqdm(f, desc="Processing lines", unit="line"):
                        jpg_links.extend(extract_jpg_links(line))
                print(f"Found {len(jpg_links)} JPG links in {os.path.basename(temp_file.name)}")
            except Exception as e:
                print(f"Error processing WET file {os.path.basename(temp_file.name)}: {e}")
            finally:
                safe_delete(temp_file.name)
            return jpg_links
    return []

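# Score the similarity of two images; values close to 1.0 mean near-identical content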
def compare_images(img1, img2):
    # Convert images to grayscale
    gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
    gray2 = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)

    # Resize images to the same size so template matching yields a single score
    gray1 = cv2.resize(gray1, (200, 200))
    gray2 = cv2.resize(gray2, (200, 200))

    # Compare images with normalized cross-correlation (TM_CCOEFF_NORMED);
    # with equally sized inputs the result is a single value in [-1, 1]
    similarity = cv2.matchTemplate(gray1, gray2, cv2.TM_CCOEFF_NORMED)[0][0]
    return similarity

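# Load every PNG/JPG reference image from BASE_IMAGE_FOLDER into memory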
def load_base_images():
    base_images = []
    for filename in os.listdir(BASE_IMAGE_FOLDER):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            img_path = os.path.join(BASE_IMAGE_FOLDER, filename)
            img = cv2.imread(img_path)
            if img is not None:
                base_images.append(img)
    return base_images

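# Download one candidate image, run MediaPipe face detection on it, compare any face-bearing image
# against the reference set, and either open a match in the browser or archive the image locally.
# Returns True when a match above SIMILARITY_THRESHOLD is found, signalling the caller to stop.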
def download_and_process_image(link, base_images):
    # Create the temp file, then close the handle so the file can be moved or deleted later (required on Windows)
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
    temp_file.close()
    try:
        if download_file(link, temp_file.name):
            img = cv2.imread(temp_file.name)
            if img is not None:
                with mp_face_detection.FaceDetection(model_selection=1, min_detection_confidence=0.6) as face_detection:
                    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                    results = face_detection.process(img_rgb)

                    if results.detections:
                        for base_img in base_images:
                            similarity = compare_images(img, base_img)
                            if similarity > SIMILARITY_THRESHOLD:
                                print(f"Match found! Similarity: {similarity}")
                                print(f"Matching image URL: {link}")
                                webbrowser.open(link)
                                return True  # Signal to stop processing

                        output_folder = 'humanfacial'
                        os.makedirs(output_folder, exist_ok=True)
                        output_filename = os.path.join(output_folder, os.path.basename(link))
                        shutil.move(temp_file.name, output_filename)
                        print(f'Moved image with face: {os.path.basename(link)}')
                    else:
                        print(f'No faces found in: {os.path.basename(link)}')
            else:
                print(f'Could not process image: {os.path.basename(link)}')
    except Exception as e:
        print(f"Error processing image {os.path.basename(link)}: {e}")
    finally:
        safe_delete(temp_file.name)
    return False  # Continue processing

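# Persist / restore the list of already-processed WET paths so an interrupted run can be resumed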
def save_progress(processed_files):
    with open(PROGRESS_FILE, 'w') as f:
        json.dump(processed_files, f)

def load_progress():
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, 'r') as f:
            return json.load(f)
    return []

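# Orchestrate the pipeline: fetch the WET manifest, resume from saved progress if requested,
# then process WET files in CHUNK_SIZE batches with a thread pool, reporting an ETA per chunk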
def main():
    wet_file_paths = get_wet_file_paths()
    processed_files = load_progress()
    base_images = load_base_images()
    
    print(f"\nTotal WET files to process: {len(wet_file_paths)}")
    print(f"Already processed: {len(processed_files)}")
    print(f"Loaded {len(base_images)} base images for comparison")
    
    if processed_files:
        choice = input("Do you want to resume from where you left off? (yes/no): ").strip().lower()
        if choice not in ('yes', 'y'):
            processed_files = []
            print("Starting from the beginning.")
        else:
            print("Resuming from the last processed file.")
    
    start_time = time.time()
    
    for i in tqdm(range(0, len(wet_file_paths), CHUNK_SIZE), desc="Processing WET file chunks"):
        chunk = wet_file_paths[i:i+CHUNK_SIZE]
        for wet_file_path in chunk:
            if wet_file_path in processed_files:
                print(f"Skipping already processed file: {wet_file_path}")
                continue
            
            jpg_links = process_wet_file(wet_file_path)
            
            print(f"Processing {len(jpg_links)} JPG links from {wet_file_path}")
            with ThreadPoolExecutor() as executor:
                futures = [executor.submit(download_and_process_image, link, base_images) for link in jpg_links]
                for future in as_completed(futures):
                    if future.result():
                        print("Match found. Stopping the process.")
                        return
            
            processed_files.append(wet_file_path)
            save_progress(processed_files)
        
        elapsed_time = time.time() - start_time
        files_processed = i + len(chunk)
        files_remaining = len(wet_file_paths) - files_processed
        avg_time_per_file = elapsed_time / files_processed if files_processed > 0 else 0
        estimated_time_remaining = avg_time_per_file * files_remaining
        
        print(f"\nProgress: {files_processed}/{len(wet_file_paths)} WET files processed")
        print(f"Elapsed time: {elapsed_time:.2f} seconds")
        print(f"Estimated time remaining: {estimated_time_remaining:.2f} seconds")
        
    print("\nProcessing complete!")
    print(f"Total elapsed time: {time.time() - start_time:.2f} seconds")
    print(f"Total WET files processed: {len(processed_files)}")

if __name__ == "__main__":
    main()
