/** Result of resolving a user-supplied URL to a single downloadable media file. */
interface ScrapedMedia {
  url: string;
  title?: string;
  description?: string;
  type: 'image' | 'video' | 'gif';
  source: 'pinterest' | 'tenor' | 'direct';
  filename?: string;
}

/** One rendition of a Tenor item (a URL plus optional pixel dimensions). */
type MediaVariant = { url: string; width?: number; height?: number; label?: string };

/**
 * Resolves page URLs (Pinterest pins, Tenor views) and direct links to the
 * underlying media file URL.
 *
 * All network access goes through the global `fetch`.
 */
class ScraperService {
  private readonly USER_AGENT =
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

  /** Recognised file extensions (lower-case, no dot) and the media type each implies. */
  private static readonly MEDIA_TYPE_BY_EXTENSION: ReadonlyMap<string, ScrapedMedia['type']> =
    new Map<string, ScrapedMedia['type']>([
      ['jpg', 'image'],
      ['jpeg', 'image'],
      ['png', 'image'],
      ['gif', 'gif'],
      ['webp', 'image'],
      ['avif', 'image'],
      ['mp4', 'video'],
      ['webm', 'video'],
      ['mov', 'video'],
    ]);

  /**
   * Resolves `url` to a media descriptor.
   *
   * Routing order: Pinterest hosts, Tenor hosts, URLs whose path ends in a
   * known media extension, and finally a Content-Type probe of the URL.
   *
   * @param url Absolute URL supplied by the caller.
   * @returns The resolved media descriptor.
   * @throws Error when the URL is malformed (from `new URL`), when a site
   *   scraper fails, or when nothing identifies the URL as media.
   */
  async scrapeUrl(url: string): Promise<ScrapedMedia> {
    const hostname = new URL(url).hostname.toLowerCase();

    if (hostname.includes('pinterest.com') || hostname.includes('pin.it')) {
      return this.scrapePinterest(url);
    }

    if (hostname.includes('tenor.com')) {
      return this.scrapeTenor(url);
    }

    if (this.isDirectMediaUrl(url)) {
      return { url, type: this.guessMediaType(url), source: 'direct' };
    }

    const mediaType = await this.checkMediaContentType(url);
    if (mediaType) {
      return { url, type: mediaType, source: 'direct' };
    }

    throw new Error(`Unsupported URL: ${url}`);
  }

  /**
   * Fetches a pin page and extracts the first video (preferred) or image URL.
   *
   * @param url A pinterest.com pin URL or a pin.it short link.
   * @throws Error prefixed with "Failed to scrape Pinterest:" on any failure.
   */
  private async scrapePinterest(url: string): Promise<ScrapedMedia> {
    try {
      let rawId = this.extractPinterestId(url);

      // pin.it short links carry no pin id; ask the shortener where they point.
      const shortLink = /pin\.it\/([^\/?#]+)/.exec(url)?.[1];
      if (shortLink !== undefined) {
        const redirectUrl = await this.resolveRedirect(
          `https://api.pinterest.com/url_shortener/${shortLink}/redirect/`
        );
        if (redirectUrl) {
          rawId = this.extractPinterestId(redirectUrl);
        }
      }

      if (!rawId) {
        throw new Error('Could not extract Pinterest pin ID');
      }

      // Slugged pins look like "some-title--123456"; the id follows the LAST
      // "--" so a title that itself contains "--" does not shadow it.
      const separatorIdx = rawId.lastIndexOf('--');
      const pinId = separatorIdx === -1 ? rawId : rawId.slice(separatorIdx + 2);
      if (!pinId) {
        throw new Error('Could not extract Pinterest pin ID');
      }

      const response = await fetch(
        `https://www.pinterest.com/pin/${encodeURIComponent(pinId)}/`,
        { headers: { 'User-Agent': this.USER_AGENT } }
      );
      const html = await response.text();

      if (/"__typename"\s*:\s*"PinNotFound"/.test(html)) {
        throw new Error('Pinterest pin not found');
      }

      const videoLink = [...html.matchAll(/"url":"(https:\/\/v1\.pinimg\.com\/videos\/.*?)"/g)]
        .map((match) => match[1])
        .find((link): link is string => link !== undefined && link.endsWith('.mp4'));
      if (videoLink) {
        return {
          url: videoLink,
          type: 'video',
          source: 'pinterest',
          filename: `pinterest_${pinId}.mp4`,
        };
      }

      const imageLink = [...html.matchAll(/src="(https:\/\/i\.pinimg\.com\/.*?\.(jpg|gif))"/g)]
        .map((match) => match[1])
        .find(
          (link): link is string =>
            link !== undefined && (link.endsWith('.jpg') || link.endsWith('.gif'))
        );
      if (imageLink) {
        const isGif = imageLink.endsWith('.gif');
        return {
          url: imageLink,
          type: isGif ? 'gif' : 'image',
          source: 'pinterest',
          filename: `pinterest_${pinId}.${isGif ? 'gif' : 'jpg'}`,
        };
      }

      throw new Error('Could not extract media URL from Pinterest');
    } catch (error) {
      throw new Error(
        `Failed to scrape Pinterest: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }

  /**
   * Fetches a Tenor view page and picks a medium-size MP4 rendition from the
   * JSON embedded in it.
   *
   * @param url A Tenor URL of the form `/view/<slug>-<numeric id>`.
   * @throws Error prefixed with "Failed to scrape Tenor:" on any failure.
   */
  private async scrapeTenor(url: string): Promise<ScrapedMedia> {
    try {
      const gifId = url.match(/\/view\/[^\/]+-(\d+)/)?.[1];
      if (!gifId) {
        throw new Error('Invalid Tenor URL format');
      }

      const response = await fetch(url, {
        redirect: 'follow',
        headers: {
          'User-Agent': this.USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml',
        },
      });
      const html = await response.text();

      const mediaJson = this.extractTenorMediaJson(html);
      const mediaVariants = this.collectTenorMedia(mediaJson);
      const chosen = this.pickTenorVariant(mediaVariants, 'mp4', 'medium');

      if (!chosen) {
        throw new Error('Could not find suitable media variant');
      }

      // NOTE(review): the item is reported as type 'gif' although the chosen
      // rendition and filename are MP4 — kept as-is; confirm callers expect it.
      return {
        url: chosen.url,
        type: 'gif',
        source: 'tenor',
        filename: `tenor_${gifId}.mp4`,
      };
    } catch (error) {
      throw new Error(
        `Failed to scrape Tenor: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }

  /**
   * Locates and parses the JSON blob embedded in a Tenor page.
   *
   * Candidates are tried in order: a `window.__STATE__ = {...};` assignment,
   * text starting at `{"appConfig"` up to the closing script tag, and the
   * contents of the `__NEXT_DATA__` script element. The first candidate that
   * parses (directly, or after isolating its first balanced object) wins.
   *
   * @throws Error when no candidate yields valid JSON.
   */
  private extractTenorMediaJson(html: string): unknown {
    const candidates: string[] = [];

    const stateMatch = html.match(/window\.__STATE__\s*=\s*(\{[\s\S]*?\});/);
    if (stateMatch?.[1]) {
      candidates.push(stateMatch[1]);
    }

    const appConfigIdx = html.indexOf('{"appConfig"');
    if (appConfigIdx !== -1) {
      // Cap the scan so a huge page cannot produce an unbounded candidate.
      const slice = html.slice(appConfigIdx, appConfigIdx + 2_000_000);
      // Searching for '' always returns 0 and would yield an empty candidate;
      // the JSON ends where its enclosing script element closes.
      const endTagIdx = slice.indexOf('</script>');
      candidates.push(endTagIdx !== -1 ? slice.slice(0, endTagIdx) : slice);
    }

    const nextDataMatch = html.match(
      /<script[^>]*id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/
    );
    if (nextDataMatch?.[1]) {
      candidates.push(nextDataMatch[1]);
    }

    for (const raw of candidates) {
      try {
        return JSON.parse(raw);
      } catch {
        // The candidate may carry trailing text; retry on its first balanced object.
        const extracted = this.extractFirstJsonObject(raw);
        if (extracted) {
          try {
            return JSON.parse(extracted);
          } catch {
            // Not JSON either; move on to the next candidate.
          }
        }
      }
    }

    throw new Error('Unable to locate Tenor media JSON in page');
  }

  /**
   * Returns the first brace-balanced `{...}` substring of `s`, or `null` if
   * none closes. Braces inside single- or double-quoted strings are ignored,
   * and backslash escapes inside strings are honoured.
   */
  private extractFirstJsonObject(s: string): string | null {
    let depth = 0;
    let quote: string | null = null;
    let escaped = false;
    let start = -1;

    for (let i = 0; i < s.length; i++) {
      const c = s.charAt(i);

      if (quote !== null) {
        if (escaped) {
          escaped = false;
        } else if (c === '\\') {
          escaped = true;
        } else if (c === quote) {
          quote = null;
        }
        continue;
      }

      if (c === '"' || c === "'") {
        quote = c;
      } else if (c === '{') {
        if (depth === 0) start = i;
        depth++;
      } else if (c === '}' && depth > 0) {
        // A stray '}' before any '{' is skipped (depth > 0 guard); letting
        // depth go negative would hide every object that follows it.
        depth--;
        if (depth === 0) {
          return s.slice(start, i + 1);
        }
      }
    }

    return null;
  }

  /**
   * Walks arbitrary parsed JSON and gathers every value stored under a known
   * Tenor format key (`gif`, `mp4`, `tinywebm`, ...), grouped by that key.
   *
   * A value is collected when it is an object with a string `url` (dimensions
   * come from `dims[0]`/`dims[1]`, else numeric `width`/`height`) or when it is
   * itself an http(s) URL string.
   */
  private collectTenorMedia(data: unknown): Record<string, MediaVariant[]> {
    const out: Record<string, MediaVariant[]> = {};
    const formatKeys = new Set<string>([
      'gif', 'mediumgif', 'tinygif', 'nanogif',
      'mp4', 'loopedmp4', 'tinymp4', 'nanomp4',
      'webm', 'tinywebm', 'nanowebm',
      'preview',
    ]);

    const push = (format: string, url: string, meta?: Partial<MediaVariant>): void => {
      (out[format] ??= []).push({ url, ...meta });
    };

    const asNumber = (value: unknown): number | undefined =>
      typeof value === 'number' ? value : undefined;

    const collect = (format: string, value: unknown): void => {
      if (typeof value === 'string') {
        // Only URL-looking strings: keys such as "preview" may hold other text.
        if (/^https?:\/\//i.test(value)) push(format, value);
        return;
      }
      if (value === null || typeof value !== 'object' || Array.isArray(value)) return;

      const entry = value as Record<string, unknown>;
      if (typeof entry.url !== 'string') return;

      const dims: unknown[] = Array.isArray(entry.dims) ? entry.dims : [];
      push(format, entry.url, {
        width: asNumber(dims[0]) ?? asNumber(entry.width),
        height: asNumber(dims[1]) ?? asNumber(entry.height),
      });
    };

    const visit = (node: unknown): void => {
      if (node === null || typeof node !== 'object') return;

      if (Array.isArray(node)) {
        for (const item of node) visit(item);
        return;
      }

      const record = node as Record<string, unknown>;
      // Two passes: a node's own format entries are recorded before anything
      // nested below it, which fixes the order within each format list.
      for (const [key, value] of Object.entries(record)) {
        if (formatKeys.has(key)) collect(key, value);
      }
      for (const value of Object.values(record)) {
        visit(value);
      }
    };

    visit(data);
    return out;
  }

  /**
   * Chooses one rendition for the requested container format and size.
   *
   * Format keys are tried in a per-size preference order; within the first
   * non-empty key the widest rendition wins. If none match, the first
   * rendition under any key containing `prefFormat` is used.
   *
   * @returns The chosen rendition, or `null` when nothing matches.
   */
  private pickTenorVariant(
    media: Record<string, MediaVariant[]>,
    prefFormat: 'gif' | 'mp4' | 'webm',
    prefSize: 'original' | 'medium' | 'tiny' | 'nano'
  ): MediaVariant | null {
    const sizeToFormats: Record<
      'gif' | 'mp4' | 'webm',
      Record<'original' | 'medium' | 'tiny' | 'nano', string[]>
    > = {
      gif: {
        original: ['gif'],
        medium: ['mediumgif', 'gif'],
        tiny: ['tinygif', 'nanogif', 'gif'],
        nano: ['nanogif', 'tinygif', 'gif'],
      },
      mp4: {
        original: ['mp4', 'loopedmp4'],
        medium: ['mp4', 'tinymp4'],
        tiny: ['tinymp4', 'nanomp4', 'mp4'],
        nano: ['nanomp4', 'tinymp4', 'mp4'],
      },
      webm: {
        original: ['webm'],
        medium: ['webm', 'tinywebm'],
        tiny: ['tinywebm', 'nanowebm', 'webm'],
        nano: ['nanowebm', 'tinywebm', 'webm'],
      },
    };

    for (const key of sizeToFormats[prefFormat][prefSize]) {
      const list = media[key];
      if (list?.length) {
        // Copy before sorting so the caller's lists are not reordered.
        const widestFirst = [...list].sort((a, b) => (b.width ?? 0) - (a.width ?? 0));
        return widestFirst[0] ?? null;
      }
    }

    for (const key of Object.keys(media)) {
      if (!key.includes(prefFormat)) continue;
      const first = media[key]?.[0];
      if (first) return first;
    }

    return null;
  }

  /** Returns the path segment following `/pin/` in `url`, or `null` if absent. */
  private extractPinterestId(url: string): string | null {
    return url.match(/\/pin\/([^\/\?]+)/)?.[1] ?? null;
  }

  /**
   * Requests `url` without following redirects and returns its `Location`
   * header.
   *
   * @returns The header value, or `null` when it is missing or the request
   *   fails (the failure is logged with `console.warn`).
   */
  private async resolveRedirect(url: string): Promise<string | null> {
    try {
      const response = await fetch(url, {
        redirect: 'manual',
        headers: { 'User-Agent': this.USER_AGENT },
      });
      return response.headers.get('location');
    } catch (error) {
      console.warn('Failed to resolve redirect:', error);
      return null;
    }
  }

  /**
   * Returns the lower-cased extension of the URL's path when it is one of the
   * recognised media extensions, else `null`.
   *
   * Only the path is inspected, so a host such as `my.gifts.example.com` or a
   * query string mentioning `.mp4` does not count.
   */
  private mediaExtension(url: string): string | null {
    let pathname: string;
    try {
      pathname = new URL(url).pathname;
    } catch {
      // Not an absolute URL: fall back to stripping query and fragment by hand.
      pathname = url.split(/[?#]/)[0] ?? '';
    }

    const extension = /\.([a-z0-9]+)$/i.exec(pathname)?.[1]?.toLowerCase();
    return extension !== undefined && ScraperService.MEDIA_TYPE_BY_EXTENSION.has(extension)
      ? extension
      : null;
  }

  /** True when the URL's path ends in a recognised image/video extension. */
  private isDirectMediaUrl(url: string): boolean {
    return this.mediaExtension(url) !== null;
  }

  /**
   * Maps the URL's file extension to a media type; anything unrecognised is
   * reported as `'image'`.
   */
  private guessMediaType(url: string): 'image' | 'video' | 'gif' {
    const extension = this.mediaExtension(url);
    return (
      (extension !== null
        ? ScraperService.MEDIA_TYPE_BY_EXTENSION.get(extension)
        : undefined) ?? 'image'
    );
  }

  /**
   * Maps a Content-Type header value to a media type.
   *
   * Parameters after `;` (e.g. `charset`) and letter case are ignored, so
   * `Image/GIF; charset=binary` still classifies as `'gif'`.
   */
  private classifyContentType(header: string | null): 'image' | 'video' | 'gif' | null {
    const mimeType = (header?.split(';')[0] ?? '').trim().toLowerCase();
    if (mimeType === 'image/gif') return 'gif';
    if (mimeType.startsWith('image/')) return 'image';
    if (mimeType.startsWith('video/')) return 'video';
    return null;
  }

  /**
   * Probes `url` for a media Content-Type.
   *
   * A HEAD request is tried first. If it throws or returns a non-2xx status
   * (some servers reject HEAD), a GET with `Range: bytes=0-1023` is tried
   * instead.
   *
   * @returns The detected media type, or `null` when the URL is not media or
   *   cannot be reached.
   */
  private async checkMediaContentType(url: string): Promise<'image' | 'video' | 'gif' | null> {
    try {
      const head = await fetch(url, {
        method: 'HEAD',
        headers: { 'User-Agent': this.USER_AGENT },
      });
      if (head.ok) {
        return this.classifyContentType(head.headers.get('content-type'));
      }
    } catch {
      // HEAD failed at the network level; the ranged GET below is the fallback.
    }

    try {
      const response = await fetch(url, {
        method: 'GET',
        headers: {
          'User-Agent': this.USER_AGENT,
          'Range': 'bytes=0-1023',
        },
      });
      const mediaType = response.ok
        ? this.classifyContentType(response.headers.get('content-type'))
        : null;
      // Only headers are needed; cancel the body so a server that ignores the
      // Range header does not keep streaming a large file to us.
      await response.body?.cancel().catch(() => undefined);
      return mediaType;
    } catch {
      return null;
    }
  }
}

/** Shared service instance. */
export const scraperService = new ScraperService();