#!/usr/bin/env python3
"""
Script de test spécifique pour le scraping Instagram
"""

import requests
import re
import json
from bs4 import BeautifulSoup
import logging

# Logging configuration: INFO level for the whole script.
logging.basicConfig(level=logging.INFO)
# Module-level logger; currently unused below (the script reports via print).
logger = logging.getLogger(__name__)

def test_instagram_scraping():
    """Exploratory probe of Instagram reel scraping for one hard-coded URL.

    Runs four diagnostic steps:
      1. fetch the reel page and look for og:description / duration metadata,
      2. fetch the embed variant of the page and repeat the search,
      3. dump the fetched HTML to ``instagram_test.html`` for offline analysis,
      4. probe the HTML for embedded JSON payloads (ld+json, _sharedData, ...).

    Purely diagnostic: prints its findings and returns nothing.  Any
    network/parsing error is caught, printed with a traceback, and swallowed.
    """

    # URL under test (a single public reel).
    url = "https://www.instagram.com/reel/DMpc37-tJdF/"

    print(f"🔍 Test scraping Instagram: {url}")

    # Browser-like headers: Instagram serves stripped-down pages (or blocks
    # requests outright) when the client does not look like a real browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://www.instagram.com/',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'iframe',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin'
    }

    session = requests.Session()
    session.headers.update(headers)

    # Candidate JSON keys that may carry the clip duration in seconds.
    # Defined BEFORE any request so Test 2 can use them even when Test 1
    # does not return HTTP 200 (previously the list was defined inside that
    # branch, making Test 2 raise NameError on any non-200 first response).
    duration_patterns = [
        r'"duration":\s*"?(\d+)"?',
        r'"timeRequired":\s*"?(\d+)"?',
        r'"lengthSeconds":\s*"?(\d+)"?',
        r'"videoDuration":\s*"?(\d+)"?',
        r'"durationSeconds":\s*"?(\d+)"?',
        r'"length":\s*"?(\d+)"?',
        r'"time":\s*"?(\d+)"?'
    ]

    try:
        # Test 1: the canonical reel page.
        print("\n📡 Test 1: URL normale")
        response = session.get(url, timeout=15)
        print(f"Status: {response.status_code}")
        print(f"Content length: {len(response.text)}")

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Caption/description exposed through Open Graph metadata.
            meta_desc = soup.find('meta', property='og:description')
            if meta_desc:
                description = meta_desc.get('content', '')
                print(f"✅ Description trouvée: {description[:200]}...")
            else:
                print("❌ Description non trouvée")

            # Duration exposed through Open Graph video metadata (if present).
            meta_duration = soup.find('meta', property='video:duration')
            if meta_duration:
                duration = meta_duration.get('content', '')
                print(f"✅ Durée trouvée: {duration}")
            else:
                print("❌ Durée non trouvée dans les métadonnées")

            # Fall back to scanning the raw HTML/JSON for a duration-like key.
            for pattern in duration_patterns:
                matches = re.findall(pattern, response.text, re.IGNORECASE)
                if matches:
                    print(f"✅ Durée trouvée avec pattern: {matches[0]} secondes")
                    break
            else:  # no break: none of the patterns matched
                print("❌ Durée non trouvée dans le JSON")

        # Test 2: the embed page.  Instagram's embed endpoint appends
        # 'embed/' AFTER the shortcode (…/reel/<id>/embed/); the previous
        # '/reel/' -> '/reel/embed/' substitution produced an invalid path
        # (…/reel/embed/<id>/).  `url` ends with '/', so append directly.
        print("\n📡 Test 2: URL embed")
        embed_url = url + 'embed/'
        response_embed = session.get(embed_url, timeout=15)
        print(f"Status: {response_embed.status_code}")
        print(f"Content length: {len(response_embed.text)}")

        if response_embed.status_code == 200:
            soup_embed = BeautifulSoup(response_embed.text, 'html.parser')

            # Same og:description lookup, against the embed document.
            meta_desc_embed = soup_embed.find('meta', property='og:description')
            if meta_desc_embed:
                description_embed = meta_desc_embed.get('content', '')
                print(f"✅ Description embed trouvée: {description_embed[:200]}...")
            else:
                print("❌ Description embed non trouvée")

            # Same duration-pattern scan, against the embed document.
            for pattern in duration_patterns:
                matches = re.findall(pattern, response_embed.text, re.IGNORECASE)
                if matches:
                    print(f"✅ Durée embed trouvée avec pattern: {matches[0]} secondes")
                    break
            else:  # no break: none of the patterns matched
                print("❌ Durée embed non trouvée")

        # Test 3: keep Test 1's raw HTML around for manual inspection.
        print("\n💾 Test 3: Sauvegarde du HTML")
        with open('instagram_test.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
        print("✅ HTML sauvegardé dans instagram_test.html")

        # Test 4: look for the JSON payloads Instagram ships inside the page.
        print("\n🔍 Test 4: Analyse du contenu JSON")
        json_patterns = [
            r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>',
            r'window\._sharedData\s*=\s*({.*?});',
            r'window\.__additionalDataLoaded[^>]*>(.*?)</script>'
        ]

        for pattern in json_patterns:
            matches = re.findall(pattern, response.text, re.DOTALL)
            if matches:
                print(f"✅ JSON trouvé avec pattern: {len(matches)} matches")
                for i, match in enumerate(matches[:2]):  # cap output at 2 matches
                    try:
                        json_data = json.loads(match)
                        print(f"  JSON {i+1}: {str(json_data)[:200]}...")
                    except json.JSONDecodeError:
                        # Narrowed from a bare 'except' so unrelated errors
                        # (KeyboardInterrupt, NameError, ...) are not hidden.
                        print(f"  JSON {i+1}: Non parsable")
            else:
                print("❌ JSON non trouvé avec pattern")

    except Exception as e:
        # Diagnostic script: report and swallow so every test run terminates
        # with a readable trace instead of an unhandled crash.
        print(f"❌ Erreur: {e}")
        import traceback
        traceback.print_exc()

# Run the probe only when executed as a script, not when imported.
if __name__ == "__main__":
    test_instagram_scraping() 