#!/usr/bin/env python3
"""
Script final pour le scraping Instagram avec brotli
"""

import requests
import re
import json
import gzip
import brotli
from bs4 import BeautifulSoup
import logging
import time

# Logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
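
def decompress_body(response):
    """Return a response body as text, decompressing manually if needed.

    requests normally decodes gzip/deflate transparently (and brotli too,
    when the brotli package is installed), so response.content is often
    already plain HTML; manual decompression is only a fallback for raw
    compressed bytes, with response.text as the last resort.
    """
    encoding = response.headers.get('content-encoding', '')
    try:
        if encoding == 'gzip':
            text = gzip.decompress(response.content).decode('utf-8')
            print("✅ Content decompressed with gzip")
            return text
        if encoding == 'br':
            text = brotli.decompress(response.content).decode('utf-8')
            print("✅ Content decompressed with brotli")
            return text
    except (OSError, brotli.error):
        # the body was already decoded by requests/urllib3
        print("✅ Body already decoded by requests")
        return response.text
    # deflate and unknown encodings fall through to response.text
    print("✅ Content not compressed")
    return response.text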

def test_instagram_final():
    """Final Instagram scraping test"""

    # Test URL
    url = "https://www.instagram.com/reel/DMpc37-tJdF/"

    print(f"🔍 Final Instagram scraping test: {url}")
    
    # Headers to mimic a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Linux"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive'
    }
    
    session = requests.Session()
    session.headers.update(headers)
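    # NOTE: anonymous requests often receive a login page or a consent
    # redirect instead of the real post HTML, so sparse results below do
    # not necessarily mean the extraction patterns are wrong.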
    
    try:
        # Test 1: canonical URL
        print("\n📡 Test 1: canonical URL")
        response = session.get(url, timeout=30)
        print(f"Status: {response.status_code}")
        print(f"Content-Encoding: {response.headers.get('content-encoding', 'none')}")
        print(f"Content length: {len(response.content)}")
        
        # Decompress with the appropriate method (see decompress_body)
        content = decompress_body(response)

        print(f"Final content length: {len(content)}")
        
        # Save the content for offline inspection
        with open('instagram_final.html', 'w', encoding='utf-8') as f:
            f.write(content)
        print("✅ Content saved to instagram_final.html")
        
        # Parse with BeautifulSoup
        soup = BeautifulSoup(content, 'html.parser')
        
        # Look for metadata
        print("\n🔍 Searching for metadata:")
        
        # Meta tags
        meta_tags = soup.find_all('meta')
        for meta in meta_tags:
            property_attr = meta.get('property', '')
            content_attr = meta.get('content', '')
            if 'description' in property_attr.lower():
                print(f"✅ Meta description: {content_attr[:100]}...")
            elif 'duration' in property_attr.lower():
                print(f"✅ Meta duration: {content_attr}")
            elif 'title' in property_attr.lower():
                print(f"✅ Meta title: {content_attr[:100]}...")
        
        # Script tags carrying JSON payloads
        print("\n🔍 Searching for JSON scripts:")
        script_tags = soup.find_all('script')
        for i, script in enumerate(script_tags):
            script_content = script.string
            if not script_content:
                continue
            # ld+json is declared in the tag's type attribute, not in its body
            is_ld_json = script.get('type') == 'application/ld+json'
            if is_ld_json or 'window._sharedData' in script_content:
                print(f"✅ Script {i+1} found: {script_content[:200]}...")

                # Try to parse the JSON
                try:
                    if is_ld_json:
                        json_data = json.loads(script_content)
                        print(f"  ld+json parsed successfully: {str(json_data)[:200]}...")
                    else:
                        json_match = re.search(r'window\._sharedData\s*=\s*({.*?});', script_content, re.DOTALL)
                        if json_match:
                            json_data = json.loads(json_match.group(1))
                            print(f"  JSON parsed successfully: {str(json_data)[:200]}...")
                except json.JSONDecodeError as e:
                    print(f"  JSON parsing error: {e}")
        
        # Search for specific patterns
        print("\n🔍 Searching for specific patterns:")
        
        # Patterns for the duration
        duration_patterns = [
            r'"duration":\s*"?(\d+)"?',
            r'"timeRequired":\s*"?(\d+)"?',
            r'"lengthSeconds":\s*"?(\d+)"?',
            r'"videoDuration":\s*"?(\d+)"?',
            r'"durationSeconds":\s*"?(\d+)"?',
            r'"length":\s*"?(\d+)"?',
            r'"time":\s*"?(\d+)"?',
            r'"video_duration":\s*"?(\d+)"?',
            r'"media_duration":\s*"?(\d+)"?',
            r'"videoDurationSeconds":\s*"?(\d+)"?'
        ]
        
        for pattern in duration_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            if matches:
                print(f"✅ Duration found with pattern '{pattern}': {matches[0]} seconds")
                break
        else:
            print("❌ No duration found")
        
        # Patterns for the description
        desc_patterns = [
            r'"description":\s*"([^"]*)"',
            r'"caption":\s*"([^"]*)"',
            r'"text":\s*"([^"]*)"',
            r'"title":\s*"([^"]*)"',
            r'<meta[^>]*property="og:description"[^>]*content="([^"]*)"',
            r'<meta[^>]*name="description"[^>]*content="([^"]*)"',
            r'"edge_media_to_caption":\s*\{\s*"edges":\s*\[\s*\{\s*"node":\s*\{\s*"text":\s*"([^"]*)"'
        ]
        
        for pattern in desc_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
            if matches:
                print(f"✅ Description found with pattern '{pattern}': {matches[0][:100]}...")
                break
        else:
            print("❌ No description found")
        
        # Test 2: embed URL (/reel/<shortcode>/embed/)
        print("\n📡 Test 2: embed URL")
        embed_url = url.rstrip('/') + '/embed/'
        time.sleep(1)
        
        response_embed = session.get(embed_url, timeout=30)
        print(f"Status: {response_embed.status_code}")
        
        if response_embed.status_code == 200:
            # Decompress the embed body
            content_embed = decompress_body(response_embed)

            print(f"Embed content length: {len(content_embed)}")
            
            # Save the embed
            with open('instagram_embed_final.html', 'w', encoding='utf-8') as f:
                f.write(content_embed)
            print("✅ Embed saved")
            
            # Parse the embed
            soup_embed = BeautifulSoup(content_embed, 'html.parser')
            
            # Look for metadata in the embed
            meta_embed = soup_embed.find_all('meta')
            for meta in meta_embed:
                property_attr = meta.get('property', '')
                content_attr = meta.get('content', '')
                if 'description' in property_attr.lower():
                    print(f"✅ Embed description: {content_attr[:100]}...")
                elif 'duration' in property_attr.lower():
                    print(f"✅ Embed duration: {content_attr}")
            
            # Search the patterns in the embed
            for pattern in duration_patterns:
                matches = re.findall(pattern, content_embed, re.IGNORECASE)
                if matches:
                    print(f"✅ Embed duration found with pattern '{pattern}': {matches[0]} seconds")
                    break
            else:
                print("❌ No duration found in the embed")
            
            for pattern in desc_patterns:
                matches = re.findall(pattern, content_embed, re.IGNORECASE | re.DOTALL)
                if matches:
                    print(f"✅ Embed description found with pattern '{pattern}': {matches[0][:100]}...")
                    break
            else:
                print("❌ No description found in the embed")
        
        # Test 3: analyze the content for additional information
        print("\n📊 Test 3: Deep content analysis")

        # Instagram-specific patterns (keys seen in the shared-data /
        # GraphQL payloads; their presence varies with the frontend)
        instagram_specific = [
            r'"shortcode":\s*"([^"]*)"',
            r'"owner":\s*\{\s*"username":\s*"([^"]*)"',
            r'"video_url":\s*"([^"]*)"',
            r'"display_url":\s*"([^"]*)"',
            r'"thumbnail_src":\s*"([^"]*)"',
            r'"is_video":\s*(true|false)',
            r'"video_view_count":\s*"?(\d+)"?',
            r'"like_count":\s*"?(\d+)"?',
            r'"comment_count":\s*"?(\d+)"?'
        ]
        
        for pattern in instagram_specific:
            matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
            if matches:
                print(f"✅ Info Instagram trouvée avec pattern '{pattern}': {matches[0][:100]}...")
        
        # Print a content excerpt for debugging
        print("\n📄 Content excerpt (first 1000 characters):")
        print(content[:1000])
        
    except Exception as e:
        print(f"❌ Erreur générale: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    test_instagram_final()