a
    &hV7                     @   sf   d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlZej	ej
d eeZG dd dZdS )    N)BeautifulSoup)urlparseparse_qs)levelc                   @   sL   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dS )VideoScraperc              	   C   s*   t  | _| jjddddddd d S )NzmMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0 Safari/537.36zUtext/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8zen-US,en;q=0.5zgzip, deflate, brz
keep-alive1)z
User-AgentAcceptzAccept-LanguagezAccept-Encoding
ConnectionzUpgrade-Insecure-Requests)requestsSessionsessionheadersupdate)self r   3/var/www/html/swiplay.fr/scambot/modules/scraper.py__init__   s    
zVideoScraper.__init__c           	      C   s   |j ddd}|r,t|dd}| |S |j ddd}|rXt|dd}| |S |jddd	}|D ]X}zBt|j}t|t	r|d
p|d}|r| 
|W   S W qj   Y qjY qj0 qjdS )u*   Extrait la durée depuis les métadonnéesmetazvideo:duration)propertycontentr   zog:video:durationscriptzapplication/ld+json)typedurationZtimeRequiredN)findintgetformat_durationZfind_alljsonloadsstring
isinstancedictparse_duration_string)	r   ZsoupZmeta_durationsecondsZog_durationZjson_ld_scriptsr   datar   r   r   r   extract_duration_from_meta   s&    


z'VideoScraper.extract_duration_from_metac                 C   sX   g d}|D ]F}t ||t j}|rzt|d }| |W   S    Y qY q0 qdS )u4   Extrait la durée depuis les blobs JSON dans le HTML)z"duration":\s*"?(\d+)"?z"timeRequired":\s*"?(\d+)"?z"lengthSeconds":\s*"?(\d+)"?z"videoDuration":\s*"?(\d+)"?z"durationSeconds":\s*"?(\d+)"?z"length":\s*"?(\d+)"?z"time":\s*"?(\d+)"?r   N)refindall
IGNORECASEr   r   )r   Zhtml_contentZduration_patternspatternmatchesr#   r   r   r   extract_duration_from_json4   s    
z'VideoScraper.extract_duration_from_jsonc                 C   s6   |d }|d d }|d }|dd|dd|dS )z)Convertit les secondes en format HH:MM:SS  <   Z02d:r   )r   r#   hoursminutesZsecsr   r   r   r   L   s    zVideoScraper.format_durationc           	      C   s   |sdS | dr|dd }d}d}d}d|v rV|dd }t|}|dd }d|v r|dd }t|}|dd }d|v r|dd }t|}|d	 |d
  | }| |S dS )u+   Parse une chaîne de durée (ex: "PT1M30S")NZPT   r   H   MSr,   r-   )
startswithsplitr   r   )	r   Zduration_strr/   r0   r#   Z
hours_partZminutes_partZseconds_parttotal_secondsr   r   r   r"   S   s*    

z"VideoScraper.parse_duration_stringc              
      s  ddl m} ddlm} ddlm} ddlm  ddlm	} ddl
m ddlm} ztd	|  | }|d
 |d |d |d ||  }|j||d}	|	| ||	d fdd fdd fdd fdd fdd fdd fdd fddg}
d}|
D ]L}z2| }|rvtd|dd  d W  qW n   Y qBY n0 qB|sd}d}d}|rz8ddl}|d |}|r|d!}td"|  W n6 ty } ztd#|  W Y d}~n
d}~0 0 |rd$|v rzZddl}dd%lm} |d&|}|r|d!}||d'}|d(}td)|  W n6 ty } ztd*|  W Y d}~n
d}~0 0 |dkrdd%lm} | }|d(}td+|  z> j d,f}t!t"t#|$d-}| %t&|}W n   d}Y n0 |	'  |||||d.W S  ty } z,t(d/|  |d0dddd.W  Y d}~S d}~0 0 dS )1u1   Scrape une vidéo Instagram en utilisant seleniumr   )	webdriver)Options)Service)By)WebDriverWait)expected_conditions)ChromeDriverManageru   🔍 Scraping Instagram: z
--headlessz--no-sandboxz--disable-dev-shm-usagez|--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36)serviceoptions
   c                      s     jdfdS )Nzmeta[property="og:description"]r   )untilpresence_of_element_locatedCSS_SELECTORget_attributer   r<   ZECwaitr   r   <lambda>       z/VideoScraper.scrape_instagram.<locals>.<lambda>c                      s     jdfjS )NZarticlerC   rD   rE   textr   rG   r   r   rI      rJ   c                      s     jdfjS )Nzarticle spanrK   r   rG   r   r   rI      rJ   c                      s     jdfjS )Nz[role="menuitem"]rK   r   rG   r   r   rI      rJ   c                      s     jdfjS )Nz._a9zcrK   r   rG   r   r   rI      rJ   c                      s     jdfjS )Nz&h1._ap3a._aaco._aacu._aacx._aad6._aaderK   r   rG   r   r   rI      rJ   c                      s     jdfjS )Nz	div._a9zsrK   r   rG   r   r   rI      rJ   c                      s     jdfjS )Nzdiv._ae5q._ae5r._ae5srK   r   rG   r   r   rI      rJ   Nu   ✅ Description trouvée: d   z...   Description non trouvée   Non trouvéeZ	Instagramz5-\s+([a-zA-Z0-9_.]+)\s+on\s+[A-Za-z]+ \d{1,2}, \d{4}:r3   u   ✅ Pseudo créateur trouvé: zErreur extraction pseudo: zon datetimezon ([A-Za-z]+ \d{1,2}, \d{4}):z	%B %d, %Y%d/%m/%Yu%   ✅ Date extraite de la description: zErreur extraction date: uC   ⚠️ Aucune date trouvée, utilisation de la date d'aujourd'hui: videor   urldescriptionr   published_dateplatformu   ❌ Erreur scraping Instagram: zErreur lors du scraping))Zseleniumr9   Z!selenium.webdriver.chrome.optionsr:   Z!selenium.webdriver.chrome.servicer;   Zselenium.webdriver.common.byr<   Zselenium.webdriver.support.uir=   Zselenium.webdriver.supportr>   Zwebdriver_manager.chromer?   loggerinfoadd_argumentZinstallZChromer   r&   searchgroup	ExceptiondebugrQ   strptimestrftimenowwarningrC   rD   ZTAG_NAMEstrroundfloatrF   r   r   quiterror)r   rU   r9   r:   r;   r=   r?   Zchrome_optionsr@   ZdrivermethodsrV   methodrW   creator_pseudor&   Zpseudo_matcherQ   matchdate_strdate_objtodayZ
video_elemr   r   rG   r   scrape_instagramu   s    






&

&


zVideoScraper.scrape_instagramc              
   C   sn  ddl }ddlm} ztd|  dddd}||}|j|dd}|d	d
}|sj|dd}|d}|r| |}	nd}	|d}
d}|
rz||
d}|	d}W n.   |d}|r|
|}|	d}Y n0 |||	|ddW  d   W S 1 s0    Y  W nP tyh } z6td|  |dt| ddddW  Y d}~S d}~0 0 dS )u-   Scrape une vidéo YouTube en utilisant yt-dlpr   NrP   u   🔍 Scraping YouTube: T)quietZno_warningsZextract_flatF)ZdownloadrV    titlerN   r   rO   upload_datez%Y%m%drR   	timestampZYouTuberT   u   ❌ Erreur scraping YouTube: zErreur lors du scraping : )yt_dlprQ   rY   rZ   Z	YoutubeDLZextract_infor   r   r`   ra   fromtimestampr^   rh   rd   )r   rU   rw   rQ   Zydl_optsZydlZ
video_inforV   duration_secondsr   ru   rW   ro   rv   rl   r   r   r   scrape_youtube   sT    



,zVideoScraper.scrape_youtubec                 C   sb   t |}|j }d|v r$| |S d|v s4d|v r>| |S td|  |dddddS d	S )
u0   Scrape une vidéo selon la plateforme détectéezinstagram.comzyoutube.comzyoutu.beu"   ⚠️ Plateforme non supportée: u   Plateforme non supportéerO   ZInconnuerT   N)r   netloclowerrq   rz   rY   rc   )r   rU   
parsed_urldomainr   r   r   scrape_video;  s    


zVideoScraper.scrape_videoN)__name__
__module____qualname__r   r%   r+   r   r"   rq   rz   r   r   r   r   r   r      s   " ?r   )r
   r&   r   Zbs4r   urllib.parser   r   loggingbasicConfigINFO	getLoggerr   rY   r   r   r   r   r   <module>   s   
