442 lines
12 KiB
TypeScript
442 lines
12 KiB
TypeScript
/**
 * Describes one media asset produced by a scrape.
 *
 * Required members come first; the optional ones carry extra metadata that a
 * producer may or may not be able to supply.
 */
interface ScrapedMedia {
  /** Address of the media file itself. */
  url: string;
  /** Broad media category of the asset. */
  type: 'image' | 'video' | 'gif';
  /** Which handler produced this entry. */
  source: 'pinterest' | 'tenor' | 'direct';

  /** Optional title for the asset. */
  title?: string;
  /** Optional longer description for the asset. */
  description?: string;
  /** Optional suggested file name, e.g. `pinterest_<pinId>.mp4`. */
  filename?: string;
}
|
|
|
|
/**
 * One rendition of a media item: its address plus whatever size information
 * the source reported for it.
 */
type MediaVariant = {
  /** Address of this rendition. */
  url: string;
  /** Reported width, when known (units as given by the source — presumably pixels). */
  width?: number;
  /** Reported height, when known (same units as `width`). */
  height?: number;
  /** Optional free-form label for this rendition. */
  label?: string;
};
|
|
|
|
/**
 * Turns a page or file URL into a single downloadable media descriptor.
 *
 * Pinterest pins (including `pin.it` short links) and Tenor "view" pages are
 * scraped from their HTML. Any other URL is accepted only when its path or its
 * `Content-Type` response header identifies it as an image, GIF or video.
 */
class ScraperService {
  /** Desktop Chrome user agent sent with every outgoing request. */
  private readonly USER_AGENT =
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

  // The `(?![a-z0-9])` boundary stops `.mov` matching `.movie`, `.gif` matching
  // `.gifs`, etc. These patterns are only ever tested against a lowercased path.
  private static readonly MEDIA_EXTENSION =
    /\.(?:jpe?g|png|gif|webp|avif|mp4|webm|mov)(?![a-z0-9])/;
  private static readonly GIF_EXTENSION = /\.gif(?![a-z0-9])/;
  private static readonly VIDEO_EXTENSION = /\.(?:mp4|webm|mov)(?![a-z0-9])/;

  /**
   * Resolves `url` to a media descriptor.
   *
   * @param url Absolute URL of a Pinterest pin, a Tenor view page, or a media file.
   * @returns The media found for the URL.
   * @throws TypeError when `url` is not a parseable absolute URL (from `new URL`).
   * @throws Error when the site-specific scrape fails or the URL is not recognised as media.
   */
  async scrapeUrl(url: string): Promise<ScrapedMedia> {
    // A fully-qualified host may carry a trailing dot; drop it before matching.
    const hostname = new URL(url).hostname.toLowerCase().replace(/\.$/, '');

    if (this.isPinterestHost(hostname)) {
      return this.scrapePinterest(url);
    }

    if (this.isTenorHost(hostname)) {
      return this.scrapeTenor(url);
    }

    if (this.isDirectMediaUrl(url)) {
      return { url, type: this.guessMediaType(url), source: 'direct' };
    }

    // No telling extension in the path: ask the server what it would send.
    const mediaType = await this.checkMediaContentType(url);
    if (mediaType) {
      return { url, type: mediaType, source: 'direct' };
    }

    throw new Error(`Unsupported URL: ${url}`);
  }

  /**
   * True for `pin.it` and for `pinterest.<tld>` hosts (with or without
   * subdomains). Matching is done on label boundaries so look-alike hosts such
   * as `notpinterest.com` or `spin.it` are rejected. Regional domains such as
   * `pinterest.com.au` or `pinterest.co.uk` are accepted.
   */
  private isPinterestHost(hostname: string): boolean {
    return (
      /(^|\.)pinterest\.[a-z]{2,3}(\.[a-z]{2})?$/.test(hostname) ||
      /(^|\.)pin\.it$/.test(hostname)
    );
  }

  /** True for `tenor.com` and its subdomains, but not for hosts merely containing that text. */
  private isTenorHost(hostname: string): boolean {
    return /(^|\.)tenor\.com$/.test(hostname);
  }

  /**
   * Scrapes a Pinterest pin page for its video (preferred) or image.
   *
   * `pin.it` short links are first resolved through Pinterest's URL-shortener
   * redirect endpoint to obtain the real pin URL.
   *
   * @throws Error prefixed with `Failed to scrape Pinterest:` on any failure.
   */
  private async scrapePinterest(url: string): Promise<ScrapedMedia> {
    try {
      let rawId = this.extractPinterestId(url);

      if (url.includes('pin.it/')) {
        const shortLink = url.split('pin.it/')[1]?.split(/[/?#]/)[0];
        if (shortLink) {
          const redirectUrl = await this.resolveRedirect(
            `https://api.pinterest.com/url_shortener/${encodeURIComponent(shortLink)}/redirect/`
          );
          if (redirectUrl) {
            rawId = this.extractPinterestId(redirectUrl);
          }
        }
      }

      const pinId = rawId ? this.normalizePinId(rawId) : '';
      if (!pinId) {
        throw new Error('Could not extract Pinterest pin ID');
      }

      const response = await fetch(`https://www.pinterest.com/pin/${pinId}/`, {
        headers: { 'User-Agent': this.USER_AGENT },
      });
      const html = await response.text();

      if (/"__typename"\s*:\s*"PinNotFound"/.test(html)) {
        throw new Error('Pinterest pin not found');
      }

      // Videos win over images: the page of a video pin also embeds poster images.
      const videoLink = [...html.matchAll(/"url":"(https:\/\/v1\.pinimg\.com\/videos\/.*?)"/g)]
        .map((match) => match[1])
        .find((link): link is string => typeof link === 'string' && link.endsWith('.mp4'));

      if (videoLink) {
        return {
          url: videoLink,
          type: 'video',
          source: 'pinterest',
          filename: `pinterest_${pinId}.mp4`,
        };
      }

      // The pattern itself guarantees a `.jpg` or `.gif` suffix, so the first hit is usable.
      const imageLink = /src="(https:\/\/i\.pinimg\.com\/.*?\.(jpg|gif))"/.exec(html)?.[1];

      if (imageLink) {
        const isGif = imageLink.endsWith('.gif');
        return {
          url: imageLink,
          type: isGif ? 'gif' : 'image',
          source: 'pinterest',
          filename: `pinterest_${pinId}.${isGif ? 'gif' : 'jpg'}`,
        };
      }

      throw new Error('Could not extract media URL from Pinterest');
    } catch (error) {
      throw new Error(
        `Failed to scrape Pinterest: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }

  /**
   * Scrapes a Tenor `/view/<slug>-<id>` page and returns its medium MP4 rendition.
   *
   * The result is labelled `type: 'gif'` even though the file is an MP4.
   *
   * @throws Error prefixed with `Failed to scrape Tenor:` on any failure.
   */
  private async scrapeTenor(url: string): Promise<ScrapedMedia> {
    try {
      const gifId = url.match(/\/view\/[^\/]+-(\d+)/)?.[1];
      if (!gifId) {
        throw new Error('Invalid Tenor URL format');
      }

      const response = await fetch(url, {
        redirect: 'follow',
        headers: {
          'User-Agent': this.USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml',
        },
      });
      const html = await response.text();

      const mediaVariants = this.collectTenorMedia(this.extractTenorMediaJson(html));
      const chosen = this.pickTenorVariant(mediaVariants, 'mp4', 'medium');

      if (!chosen) {
        throw new Error('Could not find suitable media variant');
      }

      return {
        url: chosen.url,
        type: 'gif',
        source: 'tenor',
        filename: `tenor_${gifId}.mp4`,
      };
    } catch (error) {
      throw new Error(
        `Failed to scrape Tenor: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }

  /**
   * Finds and parses the JSON state blob embedded in a Tenor page.
   *
   * Three locations are tried in order: a `window.__STATE__ = {...};`
   * assignment, a blob starting with `{"appConfig"`, and a
   * `<script id="__NEXT_DATA__">` element. Each candidate is parsed as-is
   * first and, failing that, trimmed to its first balanced `{...}` object.
   *
   * @throws Error when no candidate yields parseable JSON.
   */
  private extractTenorMediaJson(html: string): unknown {
    const candidates: string[] = [];

    const stateMatch = html.match(/window\.__STATE__\s*=\s*(\{[\s\S]*?\});/);
    if (stateMatch?.[1]) {
      candidates.push(stateMatch[1]);
    }

    const appConfigIdx = html.indexOf('{"appConfig"');
    if (appConfigIdx !== -1) {
      // Cap the scan window so a missing </script> cannot drag in the whole document.
      const slice = html.slice(appConfigIdx, appConfigIdx + 2_000_000);
      const endTagIdx = slice.indexOf('</script>');
      candidates.push(endTagIdx !== -1 ? slice.slice(0, endTagIdx) : slice);
    }

    const nextDataMatch = html.match(
      /<script[^>]*id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/
    );
    if (nextDataMatch?.[1]) {
      candidates.push(nextDataMatch[1]);
    }

    for (const raw of candidates) {
      try {
        return JSON.parse(raw);
      } catch {
        // Not clean JSON (e.g. trailing script code); fall back to brace matching.
      }

      const extracted = this.extractFirstJsonObject(raw);
      if (extracted === null) continue;

      try {
        return JSON.parse(extracted);
      } catch {
        // This candidate is unusable; move on to the next one.
      }
    }

    throw new Error('Unable to locate Tenor media JSON in page');
  }

  /**
   * Returns the first balanced top-level `{...}` substring of `s`, or `null`.
   *
   * Braces inside single- or double-quoted strings are ignored, and
   * backslash escapes inside strings are honoured. A closing brace that
   * appears before any opening brace is skipped rather than unbalancing
   * the depth counter.
   */
  private extractFirstJsonObject(s: string): string | null {
    let depth = 0;
    let quote: string | null = null;
    let escaped = false;
    let start = -1;

    for (let i = 0; i < s.length; i++) {
      const c = s[i];

      if (quote !== null) {
        if (escaped) {
          escaped = false;
        } else if (c === '\\') {
          escaped = true;
        } else if (c === quote) {
          quote = null;
        }
        continue;
      }

      if (c === '"' || c === "'") {
        quote = c;
      } else if (c === '{') {
        if (depth === 0) start = i;
        depth++;
      } else if (c === '}' && depth > 0) {
        depth--;
        if (depth === 0) {
          return s.slice(start, i + 1);
        }
      }
    }

    return null;
  }

  /**
   * Walks arbitrary parsed JSON and gathers every Tenor media rendition,
   * grouped by format key (`gif`, `tinymp4`, `webm`, ...).
   *
   * A rendition is recognised when a known format key maps either to an object
   * with a string `url` (size taken from `dims` or `width`/`height`) or
   * directly to an `http(s)` URL string.
   */
  private collectTenorMedia(data: unknown): Record<string, MediaVariant[]> {
    const out: Record<string, MediaVariant[]> = {};

    const formatKeys = new Set([
      'gif',
      'mediumgif',
      'tinygif',
      'nanogif',
      'mp4',
      'loopedmp4',
      'tinymp4',
      'nanomp4',
      'webm',
      'tinywebm',
      'nanowebm',
      'preview',
    ]);

    const push = (format: string, variant: MediaVariant): void => {
      (out[format] ??= []).push(variant);
    };

    const asNumber = (value: unknown): number | undefined =>
      typeof value === 'number' ? value : undefined;

    const visit = (node: unknown): void => {
      if (Array.isArray(node)) {
        for (const item of node) visit(item);
        return;
      }
      if (typeof node !== 'object' || node === null) return;

      const entries = Object.entries(node as Record<string, unknown>);

      // First pass: renditions declared directly on this node.
      for (const [key, value] of entries) {
        if (!formatKeys.has(key)) continue;

        if (typeof value === 'string') {
          // Only accept strings that are plainly URLs; other text is not a rendition.
          if (/^https?:\/\//i.test(value)) push(key, { url: value });
        } else if (typeof value === 'object' && value !== null) {
          const v = value as Record<string, unknown>;
          if (typeof v.url !== 'string') continue;

          const dims: unknown[] = Array.isArray(v.dims) ? v.dims : [];
          const variant: MediaVariant = { url: v.url };
          const width = asNumber(dims[0]) ?? asNumber(v.width);
          const height = asNumber(dims[1]) ?? asNumber(v.height);
          if (width !== undefined) variant.width = width;
          if (height !== undefined) variant.height = height;
          push(key, variant);
        }
      }

      // Second pass: descend into every child, format keys included.
      for (const [, value] of entries) {
        visit(value);
      }
    };

    visit(data);
    return out;
  }

  /**
   * Chooses one rendition for the requested container format and size.
   *
   * Format keys are tried in a per-size preference order; within a key the
   * widest rendition wins (first one on ties). If none of the preferred keys is
   * present, the first rendition under any key containing `prefFormat` is used.
   *
   * @returns The chosen rendition, or `null` when nothing matches.
   */
  private pickTenorVariant(
    media: Record<string, MediaVariant[]>,
    prefFormat: 'gif' | 'mp4' | 'webm',
    prefSize: 'original' | 'medium' | 'tiny' | 'nano'
  ): MediaVariant | null {
    const sizeToFormats: Record<
      'gif' | 'mp4' | 'webm',
      Record<'original' | 'medium' | 'tiny' | 'nano', string[]>
    > = {
      gif: {
        original: ['gif'],
        medium: ['mediumgif', 'gif'],
        tiny: ['tinygif', 'nanogif', 'gif'],
        nano: ['nanogif', 'tinygif', 'gif'],
      },
      mp4: {
        original: ['mp4', 'loopedmp4'],
        medium: ['mp4', 'tinymp4'],
        tiny: ['tinymp4', 'nanomp4', 'mp4'],
        nano: ['nanomp4', 'tinymp4', 'mp4'],
      },
      webm: {
        original: ['webm'],
        medium: ['webm', 'tinywebm'],
        tiny: ['tinywebm', 'nanowebm', 'webm'],
        nano: ['nanowebm', 'tinywebm', 'webm'],
      },
    };

    for (const key of sizeToFormats[prefFormat][prefSize]) {
      const list = media[key];
      if (list?.length) {
        // Strict `>` keeps the earliest entry on ties, like a stable descending sort.
        return list.reduce((best, current) =>
          (current.width ?? 0) > (best.width ?? 0) ? current : best
        );
      }
    }

    for (const key of Object.keys(media)) {
      if (!key.includes(prefFormat)) continue;
      const first = media[key]?.[0];
      if (first) return first;
    }

    return null;
  }

  /**
   * Returns the path segment following `/pin/` in `url`, or `null` if absent.
   * Query strings and fragments are not part of the returned segment.
   */
  private extractPinterestId(url: string): string | null {
    return url.match(/\/pin\/([^\/?#]+)/)?.[1] ?? null;
  }

  /**
   * Reduces a `/pin/` path segment to the pin ID. For segments of the form
   * `some-slug--12345`, the part after the last `--` is kept.
   */
  private normalizePinId(rawId: string): string {
    const parts = rawId.split('--');
    return parts[parts.length - 1] ?? '';
  }

  /**
   * Requests `url` without following redirects and returns the redirect
   * target as an absolute URL.
   *
   * @returns The resolved `Location`, or `null` when the header is missing or the request fails.
   */
  private async resolveRedirect(url: string): Promise<string | null> {
    try {
      const response = await fetch(url, {
        redirect: 'manual',
        headers: { 'User-Agent': this.USER_AGENT },
      });
      await this.discardBody(response);

      const location = response.headers.get('location');
      // `Location` may be relative; resolve it against the URL that was requested.
      return location ? new URL(location, url).toString() : null;
    } catch (error) {
      console.warn('Failed to resolve redirect:', error);
      return null;
    }
  }

  /** Lowercased path of `url`; falls back to the text before `?`/`#` when `url` is not absolute. */
  private mediaPath(url: string): string {
    try {
      return new URL(url).pathname.toLowerCase();
    } catch {
      return (url.split(/[?#]/)[0] ?? url).toLowerCase();
    }
  }

  /**
   * True when the URL's path carries a known image/video file extension.
   * The hostname and query string are deliberately not inspected.
   */
  private isDirectMediaUrl(url: string): boolean {
    return ScraperService.MEDIA_EXTENSION.test(this.mediaPath(url));
  }

  /**
   * Classifies a URL by the extension in its path: `.gif` is `'gif'`,
   * `.mp4`/`.webm`/`.mov` is `'video'`, anything else is `'image'`.
   */
  private guessMediaType(url: string): 'image' | 'video' | 'gif' {
    const path = this.mediaPath(url);
    if (ScraperService.GIF_EXTENSION.test(path)) return 'gif';
    if (ScraperService.VIDEO_EXTENSION.test(path)) return 'video';
    return 'image';
  }

  /**
   * Maps a `Content-Type` header value to a media kind. Parameters such as
   * `; charset=...` and letter case are ignored.
   *
   * @returns `'gif'`, `'image'`, `'video'`, or `null` for anything else.
   */
  private classifyContentType(header: string | null): 'image' | 'video' | 'gif' | null {
    const mime = ((header ?? '').split(';')[0] ?? '').trim().toLowerCase();
    if (mime === 'image/gif') return 'gif';
    if (mime.startsWith('image/')) return 'image';
    if (mime.startsWith('video/')) return 'video';
    return null;
  }

  /** Releases a response body that will not be read; failures are ignored. */
  private async discardBody(response: Response): Promise<void> {
    try {
      await response.body?.cancel();
    } catch {
      // Best effort only: the headers already carry everything that is needed.
    }
  }

  /**
   * Asks the server what media type `url` serves.
   *
   * A `HEAD` request is tried first. If it fails at the network level or is
   * answered with a non-2xx status, a ranged `GET` of the first KiB is tried
   * instead. The first successful response decides the result.
   *
   * @returns The media kind, or `null` when it is not media or cannot be determined.
   */
  private async checkMediaContentType(url: string): Promise<'image' | 'video' | 'gif' | null> {
    const attempts: RequestInit[] = [
      { method: 'HEAD', headers: { 'User-Agent': this.USER_AGENT } },
      {
        method: 'GET',
        headers: { 'User-Agent': this.USER_AGENT, 'Range': 'bytes=0-1023' },
      },
    ];

    for (const init of attempts) {
      try {
        const response = await fetch(url, init);
        await this.discardBody(response);
        if (!response.ok) continue;
        return this.classifyContentType(response.headers.get('content-type'));
      } catch {
        // Network-level failure: fall through to the next attempt.
      }
    }

    return null;
  }
}
|
|
|
|
/** Module-level `ScraperService` instance, constructed once when this module is evaluated. */
export const scraperService = new ScraperService();
|