#!/usr/bin/env python3
"""
Test script to directly search for keywords during Instagram scraping
"""

import requests
import re
import json
from bs4 import BeautifulSoup
import time

def _report_keyword_hits(content, keywords):
    """Return the subset of *keywords* present (case-insensitive) in *content*.

    Prints one "Found keyword" line per hit as a side effect.
    """
    lowered = content.lower()
    found_keywords = []
    for keyword in keywords:
        if keyword.lower() in lowered:
            found_keywords.append(keyword)
            print(f"   ✅ Found keyword: {keyword}")
    return found_keywords


def _report_keyword_contexts(content, keywords):
    """Print up to 3 sentence-like snippets of *content* around each keyword."""
    for keyword in keywords:
        # BUGFIX: re.escape — keywords are literal data, not regex syntax;
        # a keyword containing metacharacters (e.g. "C++") would otherwise
        # corrupt the pattern.
        pattern = rf'([^.]*{re.escape(keyword)}[^.]*)'
        matches = re.findall(pattern, content, re.IGNORECASE)
        if matches:
            print(f"   📝 Context for '{keyword}':")
            for match in matches[:3]:  # Show first 3 matches
                print(f"      ...{match.strip()}...")


def _report_json_patterns(content):
    """Print previews of embedded JSON/metadata fragments that may hold the caption."""
    json_patterns = [
        r'<script type="application/ld\+json">(.*?)</script>',
        r'window\._sharedData = (.*?);</script>',
        r'"caption":\s*"([^"]*)"',
        r'"text":\s*"([^"]*)"',
        r'"description":\s*"([^"]*)"',
    ]
    for pattern in json_patterns:
        matches = re.findall(pattern, content, re.DOTALL | re.IGNORECASE)
        if matches:
            print(f"   📄 Found JSON pattern: {pattern[:50]}...")
            for match in matches[:2]:  # Show first 2 matches
                if len(match) > 50:
                    match = match[:50] + "..."
                print(f"      {match}")


def test_instagram_keywords():
    """Probe an Instagram reel URL in several formats and report which
    target keywords appear in the raw page HTML.

    Diagnostic helper: takes no arguments, returns None, prints all
    findings to stdout. Requires network access (uses ``requests``).
    """
    url = "https://www.instagram.com/reel/DMpc37-tJdF/"
    keywords = ["Palia", "CROSSPLAY"]

    print("🔍 Testing Instagram scraping with keyword search...")
    print(f"URL: {url}")
    print(f"Keywords: {keywords}")
    print()

    # Headers to mimic a real browser (Instagram blocks obvious bots).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    # Test different URL formats.
    urls_to_test = [
        url,                           # Original reel URL
        url.replace('/reel/', '/p/'),  # Post format
        # BUGFIX: the embed page is served at .../reel/<id>/embed/, not at
        # instagram.com/embed/reel/<id>/ as the old host replacement built.
        url.rstrip('/') + '/embed/',
    ]

    for i, test_url in enumerate(urls_to_test, 1):
        print(f"📡 Test {i}: {test_url}")

        try:
            response = requests.get(test_url, headers=headers, timeout=10)
            response.raise_for_status()

            content = response.text
            print(f"   Status: {response.status_code}")
            print(f"   Content length: {len(content)} characters")

            # Search for keywords in the raw content.
            found_keywords = _report_keyword_hits(content, keywords)

            if found_keywords:
                print(f"   🎯 Keywords found: {found_keywords}")
                _report_keyword_contexts(content, found_keywords)
            else:
                print("   ❌ No keywords found")

            # Also look for JSON data that might contain the description.
            _report_json_patterns(content)

            print()

        except Exception as e:
            # Best-effort diagnostic: report the failure and try the next format.
            print(f"   ❌ Error: {e}")
            print()

    # Second pass: a line-by-line sweep of the original page source.
    print("🔍 Deep search in page source...")
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # BUGFIX: fail fast on HTTP errors instead of scanning an error page.
        response.raise_for_status()
        content = response.text

        # Report every source line that mentions a keyword.
        for lineno, line in enumerate(content.split('\n'), 1):
            for keyword in keywords:
                if keyword.lower() in line.lower():
                    print(f"   📍 Line {lineno}: {line.strip()[:100]}...")

        # Find flat JSON-like objects that mention a keyword.
        # BUGFIX: filter first, then cap the output at 5 matching blocks —
        # the old code sliced json_blocks[:5] *before* checking for
        # keywords, so matches beyond the first 5 blocks were never shown.
        json_blocks = re.findall(r'\{[^{}]*"[^"]*"[^{}]*\}', content)
        shown = 0
        for block in json_blocks:
            block_lower = block.lower()
            matched = [kw for kw in keywords if kw.lower() in block_lower]
            if not matched:
                continue
            for keyword in matched:
                print(f"   📄 JSON block with '{keyword}': {block[:200]}...")
            shown += 1
            if shown >= 5:
                break

    except Exception as e:
        print(f"   ❌ Deep search error: {e}")

# Run the diagnostic only when executed as a script (no-op on import).
if __name__ == "__main__":
    test_instagram_keywords() 