#!/usr/bin/env python3
"""
Script pour tester le scraping des vidéos Instagram avec la même approche que le tracker
"""

import requests
import re
import json
from bs4 import BeautifulSoup
import logging
import time

# Logging setup: create this module's logger, then call basicConfig at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Regexes probing for the clip duration (modelled on the tracker's patterns,
# per the original author's note). They are tried in list order; the
# "priority" field is informational only and is never read by the code.
_DURATION_PATTERNS = [
    {
        "name": "Video Duration",
        "regex": r'"duration":\s*"?(\d+)"?',
        "priority": 1
    },
    {
        "name": "Video Duration Seconds",
        "regex": r'"video_duration":\s*"?(\d+)"?',
        "priority": 2
    },
    {
        "name": "Media Duration",
        "regex": r'"media_duration":\s*"?(\d+)"?',
        "priority": 3
    },
    {
        "name": "Duration Seconds",
        "regex": r'"durationSeconds":\s*"?(\d+)"?',
        "priority": 4
    }
]

# Regexes probing for the caption/description, tried in list order.
_DESC_PATTERNS = [
    {
        "name": "Caption Text",
        "regex": r'"caption":\s*"([^"]*)"',
        "priority": 1
    },
    {
        "name": "Edge Media Caption",
        "regex": r'"edge_media_to_caption":\s*\{\s*"edges":\s*\[\s*\{\s*"node":\s*\{\s*"text":\s*"([^"]*)"',
        "priority": 2
    },
    {
        "name": "Text Content",
        "regex": r'"text":\s*"([^"]*)"',
        "priority": 3
    },
    {
        "name": "Description",
        "regex": r'"description":\s*"([^"]*)"',
        "priority": 4
    }
]

# More Instagram-specific fields looked up in the saved HTML files.
_INSTAGRAM_PATTERNS = [
    r'"shortcode":\s*"([^"]*)"',
    r'"owner":\s*\{\s*"username":\s*"([^"]*)"',
    r'"video_url":\s*"([^"]*)"',
    r'"display_url":\s*"([^"]*)"',
    r'"thumbnail_src":\s*"([^"]*)"',
    r'"is_video":\s*(true|false)',
    r'"video_view_count":\s*"?(\d+)"?',
    r'"like_count":\s*"?(\d+)"?',
    r'"comment_count":\s*"?(\d+)"?',
    r'"taken_at_timestamp":\s*"?(\d+)"?',
    r'"accessibility_caption":\s*"([^"]*)"'
]

# Generic JSON-looking key/value pairs: numeric values, then longer texts.
_JSON_PATTERNS = [
    r'"(\w+)":\s*"?(\d+)"?',  # Numbers
    r'"(\w+)":\s*"([^"]{20,200})"',  # Long texts
]

# Keys worth reporting from the generic JSON scan (compared lowercased).
_INTERESTING_JSON_KEYS = frozenset(
    ['duration', 'caption', 'text', 'description', 'title']
)


def _report_duration(html, qualifier="", location=""):
    """Print the first unambiguous duration found in *html*.

    A pattern is accepted only when all of its matches agree on one value;
    otherwise the next pattern is tried. *qualifier* and *location* are
    spliced into the success / failure messages respectively.
    """
    for pattern in _DURATION_PATTERNS:
        durations = {
            int(value)
            for value in re.findall(pattern["regex"], html, re.IGNORECASE)
        }
        if len(durations) == 1:
            duration = durations.pop()
            print(f"✅ Durée{qualifier} trouvée avec pattern '{pattern['name']}': {duration} secondes")
            return
    print(f"❌ Aucune durée trouvée{location}")


def _report_description(html, qualifier="", location=""):
    """Print the first description longer than 10 characters found in *html*.

    Patterns are tried in order; within a pattern the first match whose
    stripped length exceeds 10 characters wins. *qualifier* and *location*
    are spliced into the success / failure messages respectively.
    """
    for pattern in _DESC_PATTERNS:
        matches = re.findall(pattern["regex"], html, re.IGNORECASE | re.DOTALL)
        description = next(
            (match for match in matches if len(match.strip()) > 10), None
        )
        if description is not None:
            print(f"✅ Description{qualifier} trouvée avec pattern '{pattern['name']}': {description[:100]}...")
            return
    print(f"❌ Aucune description trouvée{location}")


def _analyze_saved_file(filename):
    """Scan one saved HTML file for Instagram fields and generic JSON pairs."""
    # Only the read can raise here, so keep the try body limited to it.
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"  ❌ Fichier {filename} non trouvé")
        return
    except (OSError, UnicodeDecodeError) as e:
        print(f"  ❌ Erreur lecture {filename}: {e}")
        return

    print(f"\n🔍 Analyse de {filename}:")

    for pattern in _INSTAGRAM_PATTERNS:
        matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
        if matches:
            print(f"  ✅ Pattern '{pattern}' trouvé: {matches[0][:100]}...")

    for pattern in _JSON_PATTERNS:
        # Filter on the interesting keys first, THEN cap at 5 results.
        # Capping before filtering only inspected the first five raw matches
        # and skipped every interesting key that appeared later.
        interesting = [
            (key, value)
            for key, value in re.findall(pattern, content, re.IGNORECASE)
            if key.lower() in _INTERESTING_JSON_KEYS
        ]
        for key, value in interesting[:5]:
            print(f"  ✅ JSON '{key}': {value[:50]}...")


def test_instagram_video():
    """Probe one Instagram reel through several URLs and report what is found.

    Four stages run in sequence and print their results:
      1. fetch the embed URL, save it, and scan for duration / description;
      2. same for the regular reel URL;
      3. re-read the saved HTML files and scan them for further fields;
      4. same as 1-2 for the mobile host.

    Takes no arguments and returns None. Any exception raised along the way
    is printed with its traceback and ends the run.
    """
    # Test URL
    url = "https://www.instagram.com/reel/DMpc37-tJdF/"
    shortcode = url.split('/reel/')[1].split('/')[0]

    print(f"🔍 Test scraping vidéo Instagram: {url}")
    print(f"📝 Shortcode: {shortcode}")

    # Same headers as the tracker.
    # NOTE(review): 'br' is advertised in Accept-Encoding; requests decodes
    # Brotli only when a Brotli library is installed -- confirm it is,
    # otherwise response.text may be undecoded.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://www.instagram.com/',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'iframe',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin'
    }

    try:
        # The context manager closes the session when the run ends.
        with requests.Session() as session:
            session.headers.update(headers)

            # Test 1: embed URL of the video (as done for profiles)
            print("\n📡 Test 1: URL embed de la vidéo")
            embed_url = f"https://www.instagram.com/reel/{shortcode}/embed/"
            response = session.get(embed_url, timeout=15)
            print(f"Status: {response.status_code}")
            print(f"Content length: {len(response.text)}")

            if response.status_code == 200:
                with open('instagram_video_embed.html', 'w', encoding='utf-8') as f:
                    f.write(response.text)
                print("✅ Contenu embed vidéo sauvegardé")

                _report_duration(response.text)
                _report_description(response.text)

            # Test 2: regular URL with the same patterns. The patterns are
            # module-level, so this stage works even when Test 1 was not 200.
            print("\n📡 Test 2: URL normale")
            response_normal = session.get(url, timeout=15)
            print(f"Status: {response_normal.status_code}")
            print(f"Content length: {len(response_normal.text)}")

            if response_normal.status_code == 200:
                with open('instagram_video_normal.html', 'w', encoding='utf-8') as f:
                    f.write(response_normal.text)
                print("✅ Contenu normal sauvegardé")

                _report_duration(
                    response_normal.text, " normale", " dans l'URL normale"
                )
                _report_description(
                    response_normal.text, " normale", " dans l'URL normale"
                )

            # Test 3: analyse the saved files
            print("\n📊 Test 3: Analyse approfondie")
            for filename in ('instagram_video_embed.html', 'instagram_video_normal.html'):
                _analyze_saved_file(filename)

            # Test 4: try the mobile URL. Swap the whole host: replacing only
            # 'instagram.com' produced 'www.m.instagram.com'.
            print("\n📱 Test 4: URL mobile")
            mobile_url = url.replace('//www.instagram.com', '//m.instagram.com', 1)
            response_mobile = session.get(mobile_url, timeout=15)
            print(f"Mobile Status: {response_mobile.status_code}")

            if response_mobile.status_code == 200:
                with open('instagram_video_mobile.html', 'w', encoding='utf-8') as f:
                    f.write(response_mobile.text)
                print("✅ Contenu mobile sauvegardé")

                _report_duration(
                    response_mobile.text, " mobile", " dans l'URL mobile"
                )
                _report_description(
                    response_mobile.text, " mobile", " dans l'URL mobile"
                )

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and stop.
        print(f"❌ Erreur générale: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    test_instagram_video() 