Archived
1
0
This repository has been archived on 2025-11-29. You can view files and clone it, but cannot push or open issues or pull requests.
Files
app/apps/backend/src/services/scraper.service.ts

442 lines
12 KiB
TypeScript

/** A single media asset resolved from a URL by the scraper in this file. */
interface ScrapedMedia {
/** URL of the media file itself (either the input URL or one extracted from a scraped page). */
url: string;
/** Optional title; no code visible in this file populates it. */
title?: string;
/** Optional description; no code visible in this file populates it. */
description?: string;
/** Coarse media kind. */
type: 'image' | 'video' | 'gif';
/** Which scraping path produced the result. */
source: 'pinterest' | 'tenor' | 'direct';
/** Suggested file name; set by the Pinterest and Tenor paths, omitted for direct URLs. */
filename?: string;
}
/** One rendition of a media item, as collected from Tenor page JSON. */
type MediaVariant = {
/** URL of this rendition. */
url: string;
/** Width taken from the entry's `dims[0]` or `width` field, when present. */
width?: number;
/** Height taken from the entry's `dims[1]` or `height` field, when present. */
height?: number;
/** Optional label; no code visible in this file populates it. */
label?: string
};
/**
 * Resolves a user-supplied URL to one downloadable media asset.
 *
 * Pinterest pins and Tenor pages are scraped from their HTML. Any other URL is
 * accepted only when its path ends in a known media extension or when the
 * server reports an `image/*` / `video/*` content type.
 */
class ScraperService {
  private readonly USER_AGENT =
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

  /** Path extensions (lower-case, without the dot) classified as video. */
  private static readonly VIDEO_EXTENSIONS: ReadonlySet<string> = new Set(['mp4', 'webm', 'mov']);

  /** Every path extension treated as directly downloadable media. */
  private static readonly MEDIA_EXTENSIONS: ReadonlySet<string> = new Set([
    'jpg',
    'jpeg',
    'png',
    'gif',
    'webp',
    'avif',
    'mp4',
    'webm',
    'mov',
  ]);

  /** Property names that denote a media rendition inside Tenor page JSON. */
  private static readonly TENOR_FORMAT_KEYS: ReadonlySet<string> = new Set([
    'gif',
    'mediumgif',
    'tinygif',
    'nanogif',
    'mp4',
    'loopedmp4',
    'tinymp4',
    'nanomp4',
    'webm',
    'tinywebm',
    'nanowebm',
    'preview',
  ]);

  /** For each preferred container and size, the Tenor format keys to try in order. */
  private static readonly TENOR_FORMAT_PREFERENCE: Record<
    'gif' | 'mp4' | 'webm',
    Record<'original' | 'medium' | 'tiny' | 'nano', string[]>
  > = {
    gif: {
      original: ['gif'],
      medium: ['mediumgif', 'gif'],
      tiny: ['tinygif', 'nanogif', 'gif'],
      nano: ['nanogif', 'tinygif', 'gif'],
    },
    mp4: {
      original: ['mp4', 'loopedmp4'],
      medium: ['mp4', 'tinymp4'],
      tiny: ['tinymp4', 'nanomp4', 'mp4'],
      nano: ['nanomp4', 'tinymp4', 'mp4'],
    },
    webm: {
      original: ['webm'],
      medium: ['webm', 'tinywebm'],
      tiny: ['tinywebm', 'nanowebm', 'webm'],
      nano: ['nanowebm', 'tinywebm', 'webm'],
    },
  };

  /**
   * Resolves `url` to a media asset.
   *
   * @param url Absolute URL of a Pinterest pin, a Tenor page, or a media file.
   * @returns The resolved media descriptor.
   * @throws TypeError if `url` cannot be parsed as an absolute URL.
   * @throws Error if scraping fails or the URL does not point at supported media.
   */
  async scrapeUrl(url: string): Promise<ScrapedMedia> {
    const hostname = new URL(url).hostname.toLowerCase();

    // Match the registered domain or a subdomain of it. A substring test would
    // also accept look-alike hosts such as "pinterest.com.evil.net".
    if (this.isHostOf(hostname, 'pinterest.com') || this.isHostOf(hostname, 'pin.it')) {
      return this.scrapePinterest(url);
    }
    if (this.isHostOf(hostname, 'tenor.com')) {
      return this.scrapeTenor(url);
    }

    if (this.isDirectMediaUrl(url)) {
      return { url, type: this.guessMediaType(url), source: 'direct' };
    }

    // No recognisable extension: ask the server what it would send.
    const mediaType = await this.checkMediaContentType(url);
    if (mediaType) {
      return { url, type: mediaType, source: 'direct' };
    }

    throw new Error(`Unsupported URL: ${url}`);
  }

  /** Returns true when `hostname` is `domain` itself or one of its subdomains. */
  private isHostOf(hostname: string, domain: string): boolean {
    return hostname === domain || hostname.endsWith(`.${domain}`);
  }

  /**
   * Scrapes a Pinterest pin page (or a `pin.it` short link) for its video or image.
   *
   * @throws Error prefixed with "Failed to scrape Pinterest: " on any failure.
   */
  private async scrapePinterest(url: string): Promise<ScrapedMedia> {
    try {
      let pinId = this.extractPinterestId(url);

      // Short links carry no pin id; ask Pinterest's shortener where they lead.
      const shortCode = this.extractPinItCode(url);
      if (shortCode) {
        const redirectUrl = await this.resolveRedirect(
          `https://api.pinterest.com/url_shortener/${shortCode}/redirect/`
        );
        if (redirectUrl) {
          pinId = this.extractPinterestId(redirectUrl);
        }
      }

      if (!pinId) {
        throw new Error('Could not extract Pinterest pin ID');
      }

      // Slugged ids look like "some-title--12345"; the id is the final segment.
      if (pinId.includes('--')) {
        pinId = pinId.slice(pinId.lastIndexOf('--') + 2);
      }

      const response = await fetch(`https://www.pinterest.com/pin/${pinId}/`, {
        headers: {
          'User-Agent': this.USER_AGENT,
        },
      });
      const html = await response.text();

      return this.parsePinterestHtml(html, pinId);
    } catch (error) {
      throw new Error(
        `Failed to scrape Pinterest: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }

  /** Returns the short code of a `pin.it` link, or null when `url` is not one. */
  private extractPinItCode(url: string): string | null {
    const { hostname, pathname } = new URL(url);
    if (!this.isHostOf(hostname.toLowerCase(), 'pin.it')) {
      return null;
    }
    const [code] = pathname.split('/').filter((segment) => segment.length > 0);
    return code ?? null;
  }

  /**
   * Extracts the first MP4 video, or failing that the first JPG/GIF image,
   * from the HTML of a pin page.
   *
   * @throws Error if the page reports a missing pin or contains no media link.
   */
  private parsePinterestHtml(html: string, pinId: string): ScrapedMedia {
    if (/"__typename"\s*:\s*"PinNotFound"/.test(html)) {
      throw new Error('Pinterest pin not found');
    }

    const videoRegex = /"url":"(https:\/\/v1\.pinimg\.com\/videos\/.*?)"/g;
    const videoLink = [...html.matchAll(videoRegex)]
      .map(([, link]) => link)
      .find((link) => link?.endsWith('.mp4'));
    if (videoLink) {
      return {
        url: videoLink,
        type: 'video',
        source: 'pinterest',
        filename: `pinterest_${pinId}.mp4`,
      };
    }

    // `[^"]` keeps the match inside one attribute value; `.` could run across
    // a non-jpg/gif src into a later attribute and yield a garbage URL.
    const imageLink = /src="(https:\/\/i\.pinimg\.com\/[^"]*?\.(?:jpg|gif))"/.exec(html)?.[1];
    if (imageLink) {
      const isGif = imageLink.endsWith('.gif');
      return {
        url: imageLink,
        type: isGif ? 'gif' : 'image',
        source: 'pinterest',
        filename: `pinterest_${pinId}.${isGif ? 'gif' : 'jpg'}`,
      };
    }

    throw new Error('Could not extract media URL from Pinterest');
  }

  /**
   * Scrapes a Tenor `/view/<slug>-<id>` page and returns its medium MP4 rendition.
   *
   * @throws Error prefixed with "Failed to scrape Tenor: " on any failure.
   */
  private async scrapeTenor(url: string): Promise<ScrapedMedia> {
    try {
      const idMatch = url.match(/\/view\/[^\/]+-(\d+)/);
      if (!idMatch) {
        throw new Error('Invalid Tenor URL format');
      }
      const gifId = idMatch[1];

      const response = await fetch(url, {
        redirect: 'follow',
        headers: {
          'User-Agent': this.USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml',
        },
      });
      const html = await response.text();

      const mediaJson = this.extractTenorMediaJson(html);
      const mediaVariants = this.collectTenorMedia(mediaJson);
      const chosen = this.pickTenorVariant(mediaVariants, 'mp4', 'medium');
      if (!chosen) {
        throw new Error('Could not find suitable media variant');
      }

      // NOTE(review): `type` stays 'gif' although the chosen rendition is an
      // MP4 file. Left as-is because callers may depend on it; confirm intent.
      return {
        url: chosen.url,
        type: 'gif',
        source: 'tenor',
        filename: `tenor_${gifId}.mp4`,
      };
    } catch (error) {
      throw new Error(
        `Failed to scrape Tenor: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }

  /**
   * Finds and parses the JSON state blob embedded in a Tenor page.
   *
   * Tries, in order: the `window.__STATE__` assignment, an inline
   * `{"appConfig"...` object, and the `__NEXT_DATA__` script tag. The first
   * candidate that parses is returned.
   *
   * @throws Error if no candidate yields parseable JSON.
   */
  private extractTenorMediaJson(html: string): unknown {
    const maxScan = 2_000_000;
    const candidates: string[] = [];

    // Slice from the opening brace and let the brace scanner find the end.
    // A lazy `{...};` regex stops at the first "};" even inside a string value.
    const stateMatch = /window\.__STATE__\s*=\s*(?=\{)/.exec(html);
    if (stateMatch) {
      const start = stateMatch.index + stateMatch[0].length;
      candidates.push(html.slice(start, start + maxScan));
    }

    const appConfigIdx = html.indexOf('{"appConfig"');
    if (appConfigIdx !== -1) {
      const slice = html.slice(appConfigIdx, appConfigIdx + maxScan);
      const endTagIdx = slice.indexOf('</script>');
      candidates.push(endTagIdx !== -1 ? slice.slice(0, endTagIdx) : slice);
    }

    const nextDataMatch = html.match(
      /<script[^>]*id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/
    );
    if (nextDataMatch?.[1]) {
      candidates.push(nextDataMatch[1]);
    }

    for (const raw of candidates) {
      try {
        return JSON.parse(raw);
      } catch {
        // Candidate has surrounding text; retry on the first balanced object.
        const extracted = this.extractFirstJsonObject(raw);
        if (extracted) {
          try {
            return JSON.parse(extracted);
          } catch {
            // Not valid JSON either; move on to the next candidate.
          }
        }
      }
    }

    throw new Error('Unable to locate Tenor media JSON in page');
  }

  /**
   * Returns the first brace-balanced `{...}` substring of `s`, or null.
   *
   * Braces inside double-quoted strings are ignored. Single quotes are not
   * string delimiters in JSON and are treated as ordinary characters.
   */
  private extractFirstJsonObject(s: string): string | null {
    let depth = 0;
    let start = -1;
    let inString = false;
    let escaped = false;

    for (let i = 0; i < s.length; i++) {
      const c = s[i];

      if (inString) {
        if (escaped) {
          escaped = false;
        } else if (c === '\\') {
          escaped = true;
        } else if (c === '"') {
          inString = false;
        }
        continue;
      }

      if (c === '"') {
        inString = true;
      } else if (c === '{') {
        if (depth === 0) start = i;
        depth++;
      } else if (c === '}' && depth > 0) {
        // A stray '}' before any '{' is skipped so depth never goes negative.
        depth--;
        if (depth === 0) {
          return s.slice(start, i + 1);
        }
      }
    }

    return null;
  }

  /** Type guard for non-null, non-array objects. */
  private static isRecord(value: unknown): value is Record<string, unknown> {
    return typeof value === 'object' && value !== null && !Array.isArray(value);
  }

  /**
   * Walks arbitrary parsed JSON and gathers every media rendition found under
   * a known Tenor format key, grouped by that key.
   *
   * A rendition is either an object with a string `url` (optionally with
   * `dims` or `width`/`height`) or a bare http(s) URL string.
   */
  private collectTenorMedia(data: unknown): Record<string, MediaVariant[]> {
    const out: Record<string, MediaVariant[]> = {};

    const push = (format: string, variant: MediaVariant): void => {
      const bucket = out[format] ?? (out[format] = []);
      bucket.push(variant);
    };

    const asNumber = (value: unknown): number | undefined =>
      typeof value === 'number' ? value : undefined;

    const visit = (node: unknown): void => {
      if (Array.isArray(node)) {
        for (const item of node) visit(item);
        return;
      }
      if (!ScraperService.isRecord(node)) return;

      // First pass: collect this node's own renditions before descending, so
      // shallower entries come first within each bucket.
      for (const [key, value] of Object.entries(node)) {
        if (!ScraperService.TENOR_FORMAT_KEYS.has(key)) continue;

        if (typeof value === 'string') {
          // Keys such as "preview" may hold arbitrary text; keep URLs only.
          if (/^https?:\/\//i.test(value)) push(key, { url: value });
        } else if (ScraperService.isRecord(value) && typeof value.url === 'string') {
          const dims: unknown[] = Array.isArray(value.dims) ? value.dims : [];
          push(key, {
            url: value.url,
            width: asNumber(dims[0]) ?? asNumber(value.width),
            height: asNumber(dims[1]) ?? asNumber(value.height),
          });
        }
      }

      // Second pass: recurse into every child.
      for (const value of Object.values(node)) {
        visit(value);
      }
    };

    visit(data);
    return out;
  }

  /**
   * Chooses one rendition for the preferred container and size.
   *
   * Format keys are tried in preference order; within the first non-empty
   * bucket the widest rendition wins (the earliest on ties). If none of the
   * preferred keys has entries, the first entry of any bucket whose key
   * contains `prefFormat` is used.
   *
   * @returns The chosen rendition, or null when nothing matches.
   */
  private pickTenorVariant(
    media: Record<string, MediaVariant[]>,
    prefFormat: 'gif' | 'mp4' | 'webm',
    prefSize: 'original' | 'medium' | 'tiny' | 'nano'
  ): MediaVariant | null {
    const order = ScraperService.TENOR_FORMAT_PREFERENCE[prefFormat][prefSize];

    for (const key of order) {
      const list = media[key];
      if (list?.length) {
        return list.reduce((best, candidate) =>
          (candidate.width ?? 0) > (best.width ?? 0) ? candidate : best
        );
      }
    }

    for (const key of Object.keys(media)) {
      if (!key.includes(prefFormat)) continue;
      const list = media[key];
      if (list?.length) return list[0] ?? null;
    }

    return null;
  }

  /** Returns the path segment following `/pin/`, or null when there is none. */
  private extractPinterestId(url: string): string | null {
    // Stop at '/', '?' and '#' so a query string or fragment never leaks into the id.
    const match = url.match(/\/pin\/([^\/?#]+)/);
    return match?.[1] ?? null;
  }

  /**
   * Requests `url` without following redirects and returns its `Location`
   * header, or null when the header is absent or the request fails.
   */
  private async resolveRedirect(url: string): Promise<string | null> {
    try {
      const response = await fetch(url, {
        redirect: 'manual',
        headers: {
          'User-Agent': this.USER_AGENT,
        },
      });
      const location = response.headers.get('location');
      await this.discardBody(response);
      return location;
    } catch (error) {
      console.warn('Failed to resolve redirect:', error);
      return null;
    }
  }

  /** Best-effort cancel of an unread response body so its connection can be released. */
  private async discardBody(response: {
    body: { cancel(): Promise<void> } | null;
  }): Promise<void> {
    try {
      await response.body?.cancel();
    } catch {
      // Nothing useful to do if the stream is already closed or locked.
    }
  }

  /**
   * Returns the lower-cased extension of the URL's path (without the dot),
   * or null when the last path segment has none. Host, query and fragment
   * are ignored.
   */
  private mediaExtensionOf(url: string): string | null {
    let path: string;
    try {
      path = new URL(url).pathname;
    } catch {
      // Not an absolute URL: strip query/fragment by hand.
      path = url.split(/[?#]/)[0] ?? '';
    }
    const match = /\.([a-z0-9]+)$/i.exec(path);
    return match?.[1]?.toLowerCase() ?? null;
  }

  /** True when the URL's path ends in a known image or video extension. */
  private isDirectMediaUrl(url: string): boolean {
    const extension = this.mediaExtensionOf(url);
    return extension !== null && ScraperService.MEDIA_EXTENSIONS.has(extension);
  }

  /** Classifies a URL by its path extension; anything not gif/video is 'image'. */
  private guessMediaType(url: string): 'image' | 'video' | 'gif' {
    const extension = this.mediaExtensionOf(url);
    if (extension === 'gif') return 'gif';
    if (extension !== null && ScraperService.VIDEO_EXTENSIONS.has(extension)) return 'video';
    return 'image';
  }

  /**
   * Maps a `Content-Type` header value to a media kind, ignoring parameters
   * such as `; charset=...`. Returns null for non-media types.
   */
  private classifyContentType(header: string | null): 'image' | 'video' | 'gif' | null {
    const [mime = ''] = (header ?? '').split(';');
    const normalized = mime.trim().toLowerCase();
    if (normalized === 'image/gif') return 'gif';
    if (normalized.startsWith('image/')) return 'image';
    if (normalized.startsWith('video/')) return 'video';
    return null;
  }

  /**
   * Asks the server what kind of content `url` serves.
   *
   * Sends a HEAD request first. If that fails or is answered with a non-2xx
   * status (some servers reject HEAD), retries with a GET limited to the
   * first kilobyte via a `Range` header.
   *
   * @returns The media kind, or null when the resource is unreachable or not media.
   */
  private async checkMediaContentType(url: string): Promise<'image' | 'video' | 'gif' | null> {
    const headers = { 'User-Agent': this.USER_AGENT };

    try {
      const head = await fetch(url, { method: 'HEAD', headers });
      if (head.ok) {
        return this.classifyContentType(head.headers.get('content-type'));
      }
    } catch {
      // Network-level failure on HEAD; fall through to the ranged GET.
    }

    try {
      const partial = await fetch(url, {
        method: 'GET',
        headers: { ...headers, 'Range': 'bytes=0-1023' },
      });
      try {
        return partial.ok ? this.classifyContentType(partial.headers.get('content-type')) : null;
      } finally {
        // Only the headers are needed; a server may ignore Range and stream everything.
        await this.discardBody(partial);
      }
    } catch {
      return null;
    }
  }
}
// Module-level ScraperService instance; the class declaration itself carries no `export`.
export const scraperService = new ScraperService();