from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
import os
import re
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def write_transcript_to_file(video_id, dir_name):
    # Get the video title
    video = YouTube(f"https://www.youtube.com/watch?v={video_id}")
    title = video.title

    # Clean the title to make it file-name safe
    clean_title = "".join(c for c in title if c.isalnum() or c.isspace())

    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    # prefer an English transcript, falling back to Romanian if English isn't available
    transcript = transcript_list.find_transcript(['en', 'ro'])

    # translating the transcript returns another transcript object;
    # fetch() retrieves the actual caption entries
    translated_transcript = transcript.translate('ro').fetch()

    # write the translated transcript to a file
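    # e.g. with dir_name='romana', a video titled "Cum funcționează Python?" would be
    # saved as romana/Cum funcționează Python.txt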
    with open(os.path.join(dir_name, clean_title + '.txt'), 'w', encoding='utf-8') as f:
        for entry in translated_transcript:
            f.write(entry['text'] + '\n')

def extract_video_id_from_url(url):
    # find the video id in the url (it's the part after "v=")
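    # e.g. 'https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=42s' -> 'dQw4w9WgXcQ'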
    match = re.search(r'v=([^&]*)', url)
    if match:
        return match.group(1)
    else:
        raise ValueError(f'Could not extract video ID from URL: {url}')

def process_video_url(video_url):
    try:
        video_id = extract_video_id_from_url(video_url)
        write_transcript_to_file(video_id, dir_name)
    except Exception as exc:
        print(f'Skipping {video_url}: {exc}')  # a bad URL or missing transcript shouldn't abort the whole batch

# read the video urls from the file
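# (logs.txt is expected to contain one YouTube watch URL per line; blank lines are skipped)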
with open('logs.txt', 'r', encoding='utf-8') as f:
    video_urls = [line.strip() for line in f if line.strip()]

# output directory
dir_name = 'romana'

# make sure the output directory exists
os.makedirs(dir_name, exist_ok=True)

# use a ThreadPoolExecutor to process the videos in parallel
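# executor.map yields results lazily and in input order; list(tqdm(...)) consumes the
# iterator so the progress bar advances as transcripts come back (the default pool
# size is min(32, os.cpu_count() + 4) on Python 3.8+)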
with ThreadPoolExecutor() as executor:
    list(tqdm(executor.map(process_video_url, video_urls), total=len(video_urls)))
