import requests
import re
import json
import logging
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class VideoScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

    def extract_duration_from_meta(self, soup):
        """Extract the video duration from page metadata."""
        # Patterns 1 & 2: meta properties that carry the duration in seconds
        for prop in ('video:duration', 'og:video:duration'):
            meta = soup.find('meta', property=prop)
            if meta:
                try:
                    seconds = int(float(meta.get('content', '')))
                    return self.format_duration(seconds)
                except (TypeError, ValueError):
                    pass
        
        # Pattern 3: JSON-LD structured data
        json_ld_scripts = soup.find_all('script', type='application/ld+json')
        for script in json_ld_scripts:
            try:
                data = json.loads(script.string or '')
                if isinstance(data, dict):
                    duration = data.get('duration') or data.get('timeRequired')
                    if duration:
                        return self.parse_duration_string(duration)
            except (json.JSONDecodeError, TypeError):
                continue
        
        return None
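    # Example (illustrative): a page containing <meta property="og:video:duration"
    # content="90"> makes extract_duration_from_meta return "00:01:30".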

    def extract_duration_from_json(self, html_content):
        """Extract the duration from JSON blobs embedded in the HTML."""
        # Regex patterns for duration fields commonly found in inline JSON.
        # The last two keys ("length", "time") are generic and may false-positive.
        duration_patterns = [
            r'"duration":\s*"?(\d+)"?',
            r'"timeRequired":\s*"?(\d+)"?',
            r'"lengthSeconds":\s*"?(\d+)"?',
            r'"videoDuration":\s*"?(\d+)"?',
            r'"durationSeconds":\s*"?(\d+)"?',
            r'"length":\s*"?(\d+)"?',
            r'"time":\s*"?(\d+)"?'
        ]
        
        for pattern in duration_patterns:
            matches = re.findall(pattern, html_content, re.IGNORECASE)
            if matches:
                try:
                    seconds = int(matches[0])
                    return self.format_duration(seconds)
                except ValueError:
                    continue
        
        return None
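    # Example (illustrative): HTML containing '"lengthSeconds": "754"' makes
    # extract_duration_from_json return "00:12:34".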

    def format_duration(self, seconds):
        """Convertit les secondes en format HH:MM:SS"""
        hours = seconds // 3600
        minutes = (seconds % 3600) // 60
        secs = seconds % 60
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"

    def parse_duration_string(self, duration_str):
        """Parse an ISO 8601 duration string (e.g. "PT1M30S") into HH:MM:SS."""
        if not duration_str:
            return None
        
        # ISO 8601 time format (PT#H#M#S); fractional seconds are truncated
        match = re.match(r'^PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+(?:\.\d+)?)S)?$', duration_str)
        if match:
            hours = int(match.group(1) or 0)
            minutes = int(match.group(2) or 0)
            seconds = int(float(match.group(3) or 0))
            total_seconds = hours * 3600 + minutes * 60 + seconds
            return self.format_duration(total_seconds)
        
        return None
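    # Examples: parse_duration_string("PT1H2M30S") returns "01:02:30";
    # parse_duration_string("PT45S") returns "00:00:45".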

    def scrape_instagram(self, url):
        """Scrape an Instagram video using Selenium."""
        # Selenium is imported lazily so it is only required for Instagram URLs
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC
        from webdriver_manager.chrome import ChromeDriverManager
        
        driver = None
        try:
            logger.info(f"🔍 Scraping Instagram: {url}")
            
            # Configure Chrome in headless mode
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
            
            # Initialize the driver via webdriver_manager
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=chrome_options)
            driver.get(url)
            
            # Wait for the page to load
            wait = WebDriverWait(driver, 10)
            
            # Several fallback strategies for locating the description.
            # The class-based selectors are brittle: Instagram rotates its
            # obfuscated class names, so they may need updating over time.
            methods = [
                # Strategy 1: meta description
                lambda: wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'meta[property="og:description"]'))).get_attribute('content'),
                
                # Strategy 2: article text
                lambda: wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'article'))).text,
                
                # Strategy 3: span inside the article
                lambda: wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'article span'))).text,
                
                # Strategy 4: element with a menuitem role (broad fallback)
                lambda: wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '[role="menuitem"]'))).text,
                
                # Strategy 5: newer Instagram selector for descriptions
                lambda: wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '._a9zc'))).text,
                
                # Strategy 6: alternative Instagram selector for descriptions
                lambda: wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1._ap3a._aaco._aacu._aacx._aad6._aade'))).text,
                
                # Strategy 7: newer selector for posts
                lambda: wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div._a9zs'))).text,
                
                # Strategy 8: selector for the caption container
                lambda: wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div._ae5q._ae5r._ae5s'))).text,
            ]
            
            description = None
            for method in methods:
                try:
                    description = method()
                    if description:
                        logger.info(f"✅ Description found: {description[:100]}...")
                        break
                except Exception:
                    continue
            
            if not description:
                description = "Description not found"
            
            # Extract the publication date and creator handle from the description
            published_date = "Not found"
            creator_pseudo = "Instagram"  # default value
            
            # The og:description usually ends with "- handle on Month DD, YYYY:"
            if description:
                try:
                    pseudo_match = re.search(r'-\s+([a-zA-Z0-9_.]+)\s+on\s+[A-Za-z]+ \d{1,2}, \d{4}:', description)
                    if pseudo_match:
                        creator_pseudo = pseudo_match.group(1)
                        logger.info(f"✅ Creator handle found: {creator_pseudo}")
                except Exception as e:
                    logger.debug(f"Handle extraction error: {e}")
            
            if description and "on " in description:
                try:
                    import re
                    from datetime import datetime
                    match = re.search(r'on ([A-Za-z]+ \d{1,2}, \d{4}):', description)
                    if match:
                        date_str = match.group(1)
                        date_obj = datetime.strptime(date_str, '%B %d, %Y')
                        published_date = date_obj.strftime('%d/%m/%Y')
                        logger.info(f"✅ Date extraite de la description: {published_date}")
                except Exception as e:
                    logger.debug(f"Erreur extraction date: {e}")
            
            # If no date was found, fall back to today's date
            if published_date == "Not found":
                published_date = datetime.now().strftime('%d/%m/%Y')
                logger.warning(f"⚠️ No date found, falling back to today's date: {published_date}")
                
            try:
                # Video duration, read from the <video> element's duration property
                video_elem = wait.until(
                    EC.presence_of_element_located((By.TAG_NAME, "video"))
                )
                duration = self.format_duration(int(round(float(video_elem.get_attribute("duration")))))
            except Exception:
                duration = "Not found"
                
            return {
                'url': url,
                'description': description,
                'duration': duration,
                'published_date': published_date,
                'platform': creator_pseudo  # use the creator handle instead of the generic 'Instagram'
            }
            
        except Exception as e:
            logger.error(f"❌ Instagram scraping error: {e}")
            return {
                'url': url,
                'description': 'Error while scraping',
                'duration': 'Not found',
                'published_date': 'Not found',
                'platform': 'Instagram'  # keep 'Instagram' when the handle is unknown
            }
        finally:
            # Always release the browser, even when scraping fails
            if driver is not None:
                driver.quit()

    def scrape_youtube(self, url):
        """Scrape a YouTube video using yt-dlp"""
        # yt-dlp is imported lazily so it is only required for YouTube URLs
        import yt_dlp
        
        try:
            logger.info(f"🔍 Scraping YouTube: {url}")
            
            # yt-dlp configuration
            ydl_opts = {
                'quiet': True,
                'no_warnings': True,
                'extract_flat': True
            }
            
            # Extract the video metadata without downloading it
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                video_info = ydl.extract_info(url, download=False)
                
                # Extract the description, falling back to the title
                description = video_info.get('description', '')
                if not description:
                    description = video_info.get('title', 'Description not found')
                
                # Extract the duration
                duration_seconds = video_info.get('duration')
                if duration_seconds:
                    duration = self.format_duration(duration_seconds)
                else:
                    duration = "Not found"
                
                # Extract the publication date
                upload_date = video_info.get('upload_date')
                published_date = "Not found"
                if upload_date:
                    try:
                        # Convert YYYYMMDD into a readable format
                        date_obj = datetime.strptime(upload_date, '%Y%m%d')
                        published_date = date_obj.strftime('%d/%m/%Y')
                    except ValueError:
                        # Fall back to the timestamp field
                        timestamp = video_info.get('timestamp')
                        if timestamp:
                            date_obj = datetime.fromtimestamp(timestamp)
                            published_date = date_obj.strftime('%d/%m/%Y')
                
                return {
                    'url': url,
                    'description': description,
                    'duration': duration,
                    'published_date': published_date,
                    'platform': 'YouTube'
                }
        except Exception as e:
            logger.error(f"❌ YouTube scraping error: {e}")
            return {
                'url': url,
                'description': f'Error while scraping: {str(e)}',
                'duration': 'Not found',
                'published_date': 'Not found',
                'platform': 'YouTube'
            }

    def scrape_video(self, url):
        """Scrape a video, dispatching on the detected platform"""
        parsed_url = urlparse(url)
        domain = parsed_url.netloc.lower()
        
        if 'instagram.com' in domain:
            return self.scrape_instagram(url)
        elif 'youtube.com' in domain or 'youtu.be' in domain:
            return self.scrape_youtube(url)
        else:
            logger.warning(f"⚠️ Unsupported platform: {domain}")
            return {
                'url': url,
                'description': 'Unsupported platform',
                'duration': 'Not found',
                'published_date': 'Not found',
                'platform': 'Unknown'
            }
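
# Minimal usage sketch, assuming network access and (for Instagram) a local
# Chrome install; the URLs below are hypothetical placeholders, not real videos.
if __name__ == '__main__':
    scraper = VideoScraper()
    for test_url in [
        'https://www.youtube.com/watch?v=VIDEO_ID',   # placeholder ID
        'https://www.instagram.com/reel/POST_ID/',    # placeholder ID
    ]:
        result = scraper.scrape_video(test_url)
        print(json.dumps(result, indent=2, ensure_ascii=False))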