import json
import logging
import re
from datetime import datetime
from urllib.parse import urlparse, parse_qs

import requests
from bs4 import BeautifulSoup

# Logging configuration.
# NOTE: basicConfig runs at import time and configures the root logger at INFO level.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

class VideoScraper:
    """Scrape the description, duration and publication date of online videos.

    Instagram pages are loaded in a headless Chrome driven by Selenium and
    YouTube metadata is fetched through yt-dlp. ``scrape_instagram``,
    ``scrape_youtube`` and ``scrape_video`` all return a dict with the keys
    ``url``, ``description``, ``duration``, ``published_date`` and
    ``platform``. Durations are formatted ``HH:MM:SS`` and dates
    ``DD/MM/YYYY``.
    """

    # Placeholder stored in result dicts when a field could not be determined.
    _NOT_FOUND = "Non trouvée"

    # Browser identity shared by the requests session and the Selenium driver.
    _USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    )

    # Output format of every published date.
    _DATE_OUTPUT_FORMAT = '%d/%m/%Y'

    # Unix timestamps outside this open interval (2010-01-01 .. 2030-01-01 UTC)
    # are treated as unrelated numbers rather than publication dates.
    _MIN_TIMESTAMP = 1262304000
    _MAX_TIMESTAMP = 1893456000

    # JSON keys (tried in this order) whose integer value is read as seconds.
    _JSON_DURATION_PATTERNS = tuple(
        re.compile(rf'"{key}":\s*"?(\d+)"?', re.IGNORECASE)
        for key in (
            'duration',
            'timeRequired',
            'lengthSeconds',
            'videoDuration',
            'durationSeconds',
            'length',
            'time',
        )
    )

    # ISO 8601 duration restricted to days/hours/minutes/seconds, e.g. "P1DT2H3M4.5S".
    _ISO_DURATION_RE = re.compile(
        r'^P(?:(\d+)D)?'
        r'(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+(?:[.,]\d+)?)S)?)?$'
    )

    # (CSS selector, attribute) pairs tried in order to find the Instagram
    # caption; an attribute of None means "use the element's visible text".
    _INSTAGRAM_DESCRIPTION_SOURCES = (
        ('meta[property="og:description"]', 'content'),
        ('article', None),
        ('article span', None),
        ('[role="menuitem"]', None),
        ('._a9zc', None),
        ('h1._ap3a._aaco._aacu._aacx._aad6._aade', None),
        ('div._a9zs', None),
        ('div._ae5q._ae5r._ae5s', None),
    )

    # Date embedded in the og:description text, e.g. "eyzuro on May 30, 2025: ".
    _DESCRIPTION_DATE_PATTERNS = (
        r'on ([A-Za-z]+ \d{1,2}, \d{4}):',  # "on May 30, 2025:"
        r'on (\d{1,2} [A-Za-z]+ \d{4}):',   # "on 30 May 2025:"
        r'on (\d{1,2}/\d{1,2}/\d{4}):',     # "on 30/05/2025:"
        r'on (\d{4}-\d{2}-\d{2}):',         # "on 2025-05-30:"
    )
    _DESCRIPTION_DATE_FORMATS = (
        '%B %d, %Y',   # May 30, 2025
        '%d %B %Y',    # 30 May 2025
        '%b %d, %Y',   # Sep 30, 2025
        '%d %b %Y',    # 30 Sep 2025
        '%d/%m/%Y',    # 30/05/2025
        '%Y-%m-%d',    # 2025-05-30
    )

    # Keys in the page's embedded JSON holding a Unix timestamp / a date string.
    _PAGE_TIMESTAMP_PATTERNS = (
        r'"taken_at_timestamp":(\d+)',
        r'"taken_at":(\d+)',
        r'"timestamp":(\d+)',
        r'"created_time":(\d+)',
        r'"date":(\d+)',
    )
    _PAGE_DATE_STRING_PATTERNS = (
        r'"upload_date":"([^"]+)"',
        r'"published_time":"([^"]+)"',
    )
    _PAGE_DATE_STRING_FORMATS = (
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%d',
    )
    # Last resort: any key mentioning time/date/timestamp/taken with 10 digits.
    _GENERIC_TIMESTAMP_PATTERN = r'"[^"]*(?:time|date|timestamp|taken)[^"]*":\s*(\d{10})'

    _TIME_ELEMENT_FORMATS = (
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S',
        '%d/%m/%Y',
        '%Y-%m-%d',
    )

    def __init__(self):
        """Create the HTTP session with browser-like default headers."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': self._USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    # ------------------------------------------------------------------
    # Duration helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _to_seconds(value):
        """Return *value* as a non-negative whole number of seconds, or None."""
        try:
            seconds = float(value)
        except (TypeError, ValueError):
            return None
        # The chained comparison is False for NaN, negatives and infinity.
        if not 0 <= seconds < float('inf'):
            return None
        return int(seconds)

    @staticmethod
    def _json_ld_items(data):
        """Return the dict nodes of a decoded JSON-LD document.

        Handles a single object, a top-level array, and an object carrying
        its nodes under ``@graph``.
        """
        if isinstance(data, dict):
            items = [data]
            graph = data.get('@graph')
            if isinstance(graph, list):
                items.extend(graph)
        elif isinstance(data, list):
            items = data
        else:
            items = []
        return [item for item in items if isinstance(item, dict)]

    def extract_duration_from_meta(self, soup):
        """Extract the video duration from a page's metadata.

        Tries, in order, the ``video:duration`` and ``og:video:duration``
        meta tags (content read as seconds), then the ``duration`` /
        ``timeRequired`` fields of JSON-LD scripts (ISO 8601 durations).

        Args:
            soup: parsed document exposing ``find`` and ``find_all``
                (a BeautifulSoup object).

        Returns:
            The duration as ``HH:MM:SS``, or None when no usable value exists.
        """
        for prop in ('video:duration', 'og:video:duration'):
            tag = soup.find('meta', property=prop)
            if not tag:
                continue
            # A tag with missing or non-numeric content is skipped instead of
            # raising or being reported as a zero-length video.
            seconds = self._to_seconds(tag.get('content'))
            if seconds is not None:
                return self.format_duration(seconds)

        for script in soup.find_all('script', type='application/ld+json'):
            try:
                data = json.loads(script.string or '')
            except (TypeError, ValueError):
                # Empty or malformed JSON-LD: try the next script.
                continue
            for item in self._json_ld_items(data):
                formatted = self.parse_duration_string(
                    item.get('duration') or item.get('timeRequired')
                )
                if formatted:
                    return formatted

        return None

    def extract_duration_from_json(self, html_content):
        """Extract a duration in seconds from JSON blobs embedded in HTML.

        The keys of ``_JSON_DURATION_PATTERNS`` are tried in order and the
        first occurrence of the first key present wins.

        Args:
            html_content: raw HTML text; None or empty yields None.

        Returns:
            The duration as ``HH:MM:SS``, or None when no key matched.
        """
        if not html_content:
            return None

        for pattern in self._JSON_DURATION_PATTERNS:
            match = pattern.search(html_content)
            if match:
                return self.format_duration(int(match.group(1)))

        return None

    def format_duration(self, seconds):
        """Convert a number of seconds to ``HH:MM:SS``.

        Args:
            seconds: int or float (anything ``int()`` accepts); fractions are
                truncated and negative values are clamped to zero.

        Returns:
            The zero-padded ``HH:MM:SS`` string (hours may exceed two digits).
        """
        total = max(0, int(seconds))
        minutes, secs = divmod(total, 60)
        hours, minutes = divmod(minutes, 60)
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"

    def parse_duration_string(self, duration_str):
        """Parse an ISO 8601 duration such as ``"PT1M30S"``.

        Day, hour, minute and (possibly fractional) second components are
        supported; fractional seconds are truncated.

        Args:
            duration_str: the duration text.

        Returns:
            The duration as ``HH:MM:SS``, or None when *duration_str* is not
            a string or is not a supported ISO 8601 duration.
        """
        if not isinstance(duration_str, str):
            return None

        match = self._ISO_DURATION_RE.match(duration_str.strip().upper())
        # "P"/"PT" alone match the regex but carry no component at all.
        if match is None or not any(match.groups()):
            return None

        days, hours, minutes, seconds = match.groups()
        total_seconds = (
            int(days or 0) * 86400
            + int(hours or 0) * 3600
            + int(minutes or 0) * 60
            + int(float((seconds or '0').replace(',', '.')))
        )
        return self.format_duration(total_seconds)

    # ------------------------------------------------------------------
    # Date helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_date(text, formats):
        """Return *text* parsed with the first matching format, or None."""
        for fmt in formats:
            try:
                return datetime.strptime(text, fmt)
            except (TypeError, ValueError):
                continue
        return None

    def _date_from_timestamp(self, timestamp):
        """Format a plausible Unix timestamp (local time) as a date, else None."""
        if not self._MIN_TIMESTAMP < timestamp < self._MAX_TIMESTAMP:
            return None
        return datetime.fromtimestamp(timestamp).strftime(self._DATE_OUTPUT_FORMAT)

    def _date_from_description(self, description):
        """Extract the "on <date>:" prefix from an Instagram description."""
        if not description:
            return None

        for pattern in self._DESCRIPTION_DATE_PATTERNS:
            match = re.search(pattern, description)
            if not match:
                continue
            date_str = match.group(1)
            logger.info(f"🔍 Date trouvée dans la description: {date_str}")
            parsed = self._parse_date(date_str, self._DESCRIPTION_DATE_FORMATS)
            if parsed is not None:
                published_date = parsed.strftime(self._DATE_OUTPUT_FORMAT)
                logger.info(f"✅ Date extraite de la description: {published_date}")
                return published_date

        return None

    def _date_from_meta(self, driver):
        """Read the date from article:published_time / og:updated_time metas."""
        from selenium.webdriver.common.by import By

        sources = (
            ('article:published_time', "✅ Date trouvée via meta", "Méthode 1 échouée"),
            ('og:updated_time', "✅ Date trouvée via og:updated_time", "Méthode 2 échouée"),
        )
        for prop, found_label, failure_label in sources:
            try:
                tag = driver.find_element(By.CSS_SELECTOR, f'meta[property="{prop}"]')
                content = tag.get_attribute('content')
                if content:
                    # fromisoformat() does not accept a trailing "Z" on older Pythons.
                    parsed = datetime.fromisoformat(content.replace('Z', '+00:00'))
                    published_date = parsed.strftime(self._DATE_OUTPUT_FORMAT)
                    logger.info(f"{found_label}: {published_date}")
                    return published_date
            except Exception as exc:
                # Best effort: a missing tag or odd format just moves on.
                logger.debug(f"{failure_label}: {exc}")
        return None

    def _date_from_page_source(self, page_source):
        """Search the page's embedded JSON for a timestamp or a date string."""
        if not page_source:
            return None

        for pattern in self._PAGE_TIMESTAMP_PATTERNS:
            for raw in re.findall(pattern, page_source):
                timestamp = int(raw)
                published_date = self._date_from_timestamp(timestamp)
                if published_date:
                    logger.info(f"✅ Date trouvée via pattern timestamp {pattern}: {published_date} (timestamp: {timestamp})")
                    return published_date

        for pattern in self._PAGE_DATE_STRING_PATTERNS:
            for raw in re.findall(pattern, page_source):
                parsed = self._parse_date(raw, self._PAGE_DATE_STRING_FORMATS)
                if parsed is not None:
                    published_date = parsed.strftime(self._DATE_OUTPUT_FORMAT)
                    logger.info(f"✅ Date trouvée via pattern string {pattern}: {published_date}")
                    return published_date

        for raw in re.findall(self._GENERIC_TIMESTAMP_PATTERN, page_source, re.IGNORECASE):
            published_date = self._date_from_timestamp(int(raw))
            if published_date:
                logger.info(f"✅ Date trouvée via recherche générique timestamp: {published_date}")
                return published_date

        return None

    def _date_from_time_elements(self, driver):
        """Read the date from the page's <time> elements, if any parse."""
        from selenium.webdriver.common.by import By

        for element in driver.find_elements(By.TAG_NAME, 'time'):
            candidates = (
                element.get_attribute('datetime'),
                element.get_attribute('title'),
                element.text,
            )
            for candidate in candidates:
                if not candidate:
                    continue
                parsed = self._parse_date(candidate, self._TIME_ELEMENT_FORMATS)
                if parsed is not None:
                    published_date = parsed.strftime(self._DATE_OUTPUT_FORMAT)
                    logger.info(f"✅ Date trouvée via time element: {published_date}")
                    return published_date
        return None

    def _instagram_published_date(self, driver, url, description):
        """Return the post date, falling back to today's date when unknown.

        Sources are tried from most to least specific; a failure in one
        source is logged at debug level and never prevents the next one.
        """
        steps = (
            ("Erreur extraction date depuis description",
             lambda: self._date_from_description(description)),
            ("Méthodes 1-2 échouées", lambda: self._date_from_meta(driver)),
            ("Méthode 3 échouée", lambda: self._date_from_page_source(driver.page_source)),
            ("Méthode 4 échouée", lambda: self._date_from_time_elements(driver)),
        )
        for failure_label, extract in steps:
            try:
                published_date = extract()
            except Exception as exc:
                logger.debug(f"{failure_label}: {exc}")
                continue
            if published_date:
                logger.info(f"✅ Date de publication finalement récupérée: {published_date}")
                return published_date

        # The post ID is only logged to help debugging; it is not decoded.
        post_match = re.search(r'/p/([A-Za-z0-9_-]+)/', url)
        if post_match:
            logger.info(f"Post ID Instagram: {post_match.group(1)}")

        published_date = datetime.now().strftime(self._DATE_OUTPUT_FORMAT)
        logger.warning(f"⚠️ Aucune date trouvée pour {url}, utilisation de la date d'aujourd'hui: {published_date}")
        return published_date

    # ------------------------------------------------------------------
    # Instagram
    # ------------------------------------------------------------------

    def _instagram_description(self, wait):
        """Return the first non-empty caption found, or a placeholder text."""
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC

        for selector, attribute in self._INSTAGRAM_DESCRIPTION_SOURCES:
            try:
                element = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
                text = element.get_attribute(attribute) if attribute else element.text
            except Exception as exc:
                # Selector absent (timeout) or element gone stale: try the next one.
                logger.debug(f"Sélecteur {selector} échoué: {exc}")
                continue
            if text:
                logger.info(f"✅ Description trouvée: {text[:100]}...")
                return text

        return "Description non trouvée"

    def _instagram_video_duration(self, wait):
        """Return the <video> element's duration as HH:MM:SS, or the placeholder."""
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC

        try:
            video_elem = wait.until(
                EC.presence_of_element_located((By.TAG_NAME, "video"))
            )
            # float(None) and round(NaN) both raise and land in the handler below.
            return self.format_duration(round(float(video_elem.get_attribute("duration"))))
        except Exception as exc:
            logger.debug(f"Durée vidéo introuvable: {exc}")
            return self._NOT_FOUND

    def scrape_instagram(self, url):
        """Scrape an Instagram video page with a headless Chrome (Selenium).

        Args:
            url: address of the Instagram post.

        Returns:
            A dict with ``url``, ``description``, ``duration``,
            ``published_date`` and ``platform``. When scraping fails the
            dict carries an error description and placeholder values.

        Raises:
            ImportError: if selenium or webdriver_manager is not installed
                (the imports happen before the error handling starts).
        """
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.support.ui import WebDriverWait
        from webdriver_manager.chrome import ChromeDriverManager

        driver = None
        try:
            logger.info(f"🔍 Scraping Instagram: {url}")

            # Headless Chrome configuration.
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument(f'--user-agent={self._USER_AGENT}')

            # webdriver_manager provides the chromedriver binary path.
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=chrome_options)
            driver.get(url)

            # Every element lookup below waits up to 10 seconds.
            wait = WebDriverWait(driver, 10)

            description = self._instagram_description(wait)
            published_date = self._instagram_published_date(driver, url, description)
            duration = self._instagram_video_duration(wait)

            return {
                'url': url,  # the canonical URL is the input URL
                'description': description,
                'duration': duration,
                'published_date': published_date,
                'platform': 'Instagram'
            }

        except Exception as e:
            logger.error(f"❌ Erreur scraping Instagram: {e}")
            return {
                'url': url,
                'description': 'Erreur lors du scraping',
                'duration': 'Non trouvée',
                'published_date': 'Non trouvée',
                'platform': 'Instagram'
            }
        finally:
            # Always release the browser, including on the error path.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as quit_error:
                    logger.debug(f"Erreur fermeture du driver: {quit_error}")

    # ------------------------------------------------------------------
    # YouTube
    # ------------------------------------------------------------------

    def _youtube_published_date(self, video_info):
        """Return the upload date from yt-dlp metadata, or the placeholder.

        ``upload_date`` (YYYYMMDD) is preferred; ``timestamp`` is used when
        it is missing or unparseable.
        """
        upload_date = video_info.get('upload_date')
        if upload_date:
            try:
                return datetime.strptime(upload_date, '%Y%m%d').strftime(self._DATE_OUTPUT_FORMAT)
            except (TypeError, ValueError) as exc:
                logger.debug(f"upload_date invalide '{upload_date}': {exc}")

        timestamp = video_info.get('timestamp')
        if timestamp:
            try:
                return datetime.fromtimestamp(timestamp).strftime(self._DATE_OUTPUT_FORMAT)
            except (TypeError, ValueError, OverflowError, OSError) as exc:
                logger.debug(f"timestamp invalide '{timestamp}': {exc}")

        return self._NOT_FOUND

    def scrape_youtube(self, url):
        """Scrape a YouTube video's metadata through yt-dlp (no download).

        Args:
            url: address of the YouTube video.

        Returns:
            A dict with ``url``, ``description``, ``duration``,
            ``published_date`` and ``platform``. When extraction fails the
            description carries the error message and the other fields hold
            placeholder values.

        Raises:
            ImportError: if yt_dlp is not installed (the import happens
                before the error handling starts).
        """
        import yt_dlp

        try:
            logger.info(f"🔍 Scraping YouTube: {url}")

            ydl_opts = {
                'quiet': True,
                'no_warnings': True,
                'extract_flat': True
            }

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                video_info = ydl.extract_info(url, download=False)

            # Fall back to the title, then to a placeholder.
            description = (
                video_info.get('description')
                or video_info.get('title')
                or 'Description non trouvée'
            )

            duration_seconds = video_info.get('duration')
            if duration_seconds:
                duration = self.format_duration(duration_seconds)
            else:
                duration = self._NOT_FOUND

            return {
                'url': url,
                'description': description,
                'duration': duration,
                'published_date': self._youtube_published_date(video_info),
                'platform': 'YouTube'
            }
        except Exception as e:
            logger.error(f"❌ Erreur scraping YouTube: {e}")
            return {
                'url': url,
                'description': f'Erreur lors du scraping : {str(e)}',
                'duration': 'Non trouvée',
                'published_date': 'Non trouvée',
                'platform': 'YouTube'
            }

    # ------------------------------------------------------------------
    # Dispatch
    # ------------------------------------------------------------------

    @staticmethod
    def _host_matches(host, domain):
        """Return True when *host* is *domain* itself or one of its subdomains."""
        return host == domain or host.endswith('.' + domain)

    def scrape_video(self, url):
        """Scrape a video with the scraper matching the URL's platform.

        The host must be ``instagram.com``, ``youtube.com``, ``youtu.be`` or
        a subdomain of one of them; look-alike hosts that merely contain
        such a name are rejected.

        Args:
            url: address of the video, including its scheme.

        Returns:
            The platform scraper's result dict, or a dict flagged
            "Plateforme non supportée" for any other host.
        """
        parsed_url = urlparse(url)
        domain = parsed_url.netloc.lower()
        # hostname excludes credentials and port, unlike netloc.
        host = (parsed_url.hostname or '').rstrip('.')

        if self._host_matches(host, 'instagram.com'):
            return self.scrape_instagram(url)
        if self._host_matches(host, 'youtube.com') or self._host_matches(host, 'youtu.be'):
            return self.scrape_youtube(url)

        logger.warning(f"⚠️ Plateforme non supportée: {domain}")
        return {
            'url': url,
            'description': 'Plateforme non supportée',
            'duration': 'Non trouvée',
            'published_date': 'Non trouvée',
            'platform': 'Inconnue'
        }