feat: Сервис и контроллер сейвов с методами получения
This commit is contained in:
441
apps/backend/src/services/scraper.service.ts
Normal file
441
apps/backend/src/services/scraper.service.ts
Normal file
@ -0,0 +1,441 @@
|
||||
/**
 * Result of resolving a user-supplied URL to a single media asset.
 */
interface ScrapedMedia {
  /** URL of the media asset itself (not necessarily the page that was scraped). */
  url: string;
  /** Optional human-readable title of the media. */
  title?: string;
  /** Optional free-form description of the media. */
  description?: string;
  /** Coarse media category. */
  type: 'image' | 'video' | 'gif';
  /** Where the media was resolved from. */
  source: 'pinterest' | 'tenor' | 'direct';
  /** Optional suggested file name, e.g. `pinterest_<pinId>.mp4`. */
  filename?: string;
}
|
||||
|
||||
/**
 * One concrete rendition of a media item (a specific URL, optionally with
 * its pixel dimensions).
 */
type MediaVariant = {
  /** URL of this rendition. */
  url: string;
  /** Width in pixels, when known. */
  width?: number;
  /** Height in pixels, when known. */
  height?: number;
  /** Optional free-form label for the rendition. */
  label?: string;
};
|
||||
|
||||
/**
 * Resolves a URL to a downloadable media asset.
 *
 * Pinterest and Tenor pages are scraped for their underlying media file;
 * any other URL is accepted only if it points directly at an image/video
 * (judged by file extension, then by the server's Content-Type).
 */
class ScraperService {
  private readonly USER_AGENT =
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

  /**
   * Matches pinterest.com, its subdomains, `pinterest.com.<cc>` variants and
   * the pin.it short-link host. Anchored so lookalike hosts such as
   * `notpinterest.com` or `spin.it` are not treated as Pinterest.
   */
  private static readonly PINTEREST_HOST =
    /(?:^|\.)(?:pinterest\.com(?:\.[a-z]{2})?|pin\.it)$/;

  /** Matches tenor.com and its subdomains only. */
  private static readonly TENOR_HOST = /(?:^|\.)tenor\.com$/;

  /** File extensions that mark a URL as pointing directly at a media file. */
  private static readonly MEDIA_EXTENSIONS: readonly string[] = [
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.webp',
    '.avif',
    '.mp4',
    '.webm',
    '.mov',
  ];

  /** Subset of MEDIA_EXTENSIONS that is classified as video. */
  private static readonly VIDEO_EXTENSIONS: readonly string[] = ['.mp4', '.webm', '.mov'];

  /**
   * Resolves `url` to a media asset.
   *
   * @param url Absolute URL supplied by the caller.
   * @returns The resolved media descriptor.
   * @throws TypeError if `url` is not a valid absolute URL (from `new URL`).
   * @throws Error if the URL is neither a supported site nor a direct media link,
   *   or if a site-specific scraper fails.
   */
  async scrapeUrl(url: string): Promise<ScrapedMedia> {
    const hostname = new URL(url).hostname.toLowerCase();

    if (this.isPinterestHost(hostname)) {
      return this.scrapePinterest(url);
    }

    if (this.isTenorHost(hostname)) {
      return this.scrapeTenor(url);
    }

    if (this.isDirectMediaUrl(url)) {
      return { url, type: this.guessMediaType(url), source: 'direct' };
    }

    // No recognizable extension: ask the server what it would send.
    const mediaType = await this.checkMediaContentType(url);
    if (mediaType) {
      return { url, type: mediaType, source: 'direct' };
    }

    throw new Error(`Unsupported URL: ${url}`);
  }

  /** True when `hostname` (lower-cased) belongs to Pinterest or pin.it. */
  private isPinterestHost(hostname: string): boolean {
    return ScraperService.PINTEREST_HOST.test(hostname);
  }

  /** True when `hostname` (lower-cased) is tenor.com or one of its subdomains. */
  private isTenorHost(hostname: string): boolean {
    return ScraperService.TENOR_HOST.test(hostname);
  }

  /**
   * Scrapes a Pinterest pin page for its video or image.
   *
   * pin.it short links are first resolved through Pinterest's URL-shortener
   * redirect endpoint to obtain the pin id.
   *
   * @throws Error prefixed with "Failed to scrape Pinterest: " on any failure.
   */
  private async scrapePinterest(url: string): Promise<ScrapedMedia> {
    try {
      let pinId = this.extractPinterestId(url);

      const shortCode = url.match(/pin\.it\/([^/?#]+)/)?.[1];
      if (shortCode) {
        const redirectUrl = await this.resolveRedirect(
          `https://api.pinterest.com/url_shortener/${shortCode}/redirect/`
        );
        if (redirectUrl) {
          pinId = this.extractPinterestId(redirectUrl);
        }
      }

      if (!pinId) {
        throw new Error('Could not extract Pinterest pin ID');
      }

      // Slug-style ids look like "some-title--123456"; the id is the trailing segment.
      const slugParts = pinId.split('--');
      const trailing = slugParts[slugParts.length - 1];
      if (trailing) {
        pinId = trailing;
      }

      const response = await fetch(`https://www.pinterest.com/pin/${pinId}/`, {
        headers: {
          'User-Agent': this.USER_AGENT,
        },
      });
      const html = await response.text();

      if (/"__typename"\s*:\s*"PinNotFound"/.test(html)) {
        throw new Error('Pinterest pin not found');
      }

      const media = this.extractPinterestMedia(html, pinId);
      if (!media) {
        throw new Error('Could not extract media URL from Pinterest');
      }
      return media;
    } catch (error) {
      throw new Error(
        `Failed to scrape Pinterest: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }

  /**
   * Finds the first usable media link in a Pinterest pin page.
   * An `.mp4` video link wins over images; otherwise the first `.jpg`/`.gif`
   * image hosted on i.pinimg.com is used.
   *
   * @returns The media descriptor, or null when the page contains neither.
   */
  private extractPinterestMedia(html: string, pinId: string): ScrapedMedia | null {
    const videoRegex = /"url":"(https:\/\/v1\.pinimg\.com\/videos\/.*?)"/g;
    const videoLink = [...html.matchAll(videoRegex)]
      .map((match) => match[1])
      .find((link): link is string => link !== undefined && link.endsWith('.mp4'));

    if (videoLink) {
      return {
        url: videoLink,
        type: 'video',
        source: 'pinterest',
        filename: `pinterest_${pinId}.mp4`,
      };
    }

    const imageRegex = /src="(https:\/\/i\.pinimg\.com\/.*?\.(jpg|gif))"/g;
    const imageLink = [...html.matchAll(imageRegex)]
      .map((match) => match[1])
      .find((link): link is string => link !== undefined);

    if (imageLink) {
      const isGif = imageLink.endsWith('.gif');
      return {
        url: imageLink,
        type: isGif ? 'gif' : 'image',
        source: 'pinterest',
        filename: `pinterest_${pinId}.${isGif ? 'gif' : 'jpg'}`,
      };
    }

    return null;
  }

  /**
   * Scrapes a Tenor `/view/<slug>-<id>` page and returns its medium MP4 rendition.
   * The result is reported with type 'gif' even though the file is an MP4.
   *
   * @throws Error prefixed with "Failed to scrape Tenor: " on any failure.
   */
  private async scrapeTenor(url: string): Promise<ScrapedMedia> {
    try {
      const gifId = url.match(/\/view\/[^\/]+-(\d+)/)?.[1];
      if (!gifId) {
        throw new Error('Invalid Tenor URL format');
      }

      const response = await fetch(url, {
        redirect: 'follow',
        headers: {
          'User-Agent': this.USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml',
        },
      });
      const html = await response.text();

      const mediaJson = this.extractTenorMediaJson(html);
      const mediaVariants = this.collectTenorMedia(mediaJson);
      const chosen = this.pickTenorVariant(mediaVariants, 'mp4', 'medium');

      if (!chosen) {
        throw new Error('Could not find suitable media variant');
      }

      return {
        url: chosen.url,
        type: 'gif',
        source: 'tenor',
        filename: `tenor_${gifId}.mp4`,
      };
    } catch (error) {
      throw new Error(
        `Failed to scrape Tenor: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }

  /**
   * Locates and parses the JSON state blob embedded in a Tenor page.
   *
   * Three locations are tried in order: a `window.__STATE__ = {...};`
   * assignment, a blob starting with `{"appConfig"`, and the
   * `__NEXT_DATA__` script tag. Each candidate is parsed as-is first and,
   * failing that, reduced to its first balanced `{...}` object.
   *
   * @throws Error when no candidate can be parsed.
   */
  private extractTenorMediaJson(html: string): unknown {
    const candidates: string[] = [];

    const stateMatch = html.match(/window\.__STATE__\s*=\s*(\{[\s\S]*?\});/);
    if (stateMatch?.[1]) {
      candidates.push(stateMatch[1]);
    }

    const appConfigIdx = html.indexOf('{"appConfig"');
    if (appConfigIdx !== -1) {
      // Cap the slice so a missing </script> cannot drag in the whole document.
      const slice = html.slice(appConfigIdx, appConfigIdx + 2_000_000);
      const endTagIdx = slice.indexOf('</script>');
      candidates.push(endTagIdx !== -1 ? slice.slice(0, endTagIdx) : slice);
    }

    const nextDataMatch = html.match(
      /<script[^>]*id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/
    );
    if (nextDataMatch?.[1]) {
      candidates.push(nextDataMatch[1]);
    }

    for (const raw of candidates) {
      try {
        return JSON.parse(raw);
      } catch {
        // Candidate has leading/trailing non-JSON text; retry on the first balanced object.
        const extracted = this.extractFirstJsonObject(raw);
        if (extracted) {
          try {
            return JSON.parse(extracted);
          } catch {
            // Not valid JSON either; fall through to the next candidate.
          }
        }
      }
    }

    throw new Error('Unable to locate Tenor media JSON in page');
  }

  /**
   * Returns the first brace-balanced `{...}` substring of `s`, or null.
   *
   * Braces inside single- or double-quoted strings (with backslash escapes)
   * are ignored. A closing brace seen before any opening brace is skipped
   * rather than counted, so it cannot hide an object that follows it.
   */
  private extractFirstJsonObject(s: string): string | null {
    let depth = 0;
    let quote: string | null = null;
    let escaped = false;
    let start = -1;

    for (let i = 0; i < s.length; i++) {
      const c = s[i];

      if (quote !== null) {
        if (escaped) {
          escaped = false;
        } else if (c === '\\') {
          escaped = true;
        } else if (c === quote) {
          quote = null;
        }
        continue;
      }

      if (c === '"' || c === "'") {
        quote = c;
      } else if (c === '{') {
        if (depth === 0) start = i;
        depth++;
      } else if (c === '}' && depth > 0) {
        depth--;
        if (depth === 0) {
          return s.slice(start, i + 1);
        }
      }
    }

    return null;
  }

  /**
   * Walks arbitrary parsed JSON and gathers every Tenor media rendition found
   * under a known format key (`gif`, `mp4`, `tinywebm`, ...).
   *
   * A rendition is either an object with a string `url` (dimensions taken from
   * `dims[0]`/`dims[1]` or `width`/`height`) or a bare http(s) URL string.
   *
   * @returns Renditions grouped by format key, in discovery order.
   */
  private collectTenorMedia(data: unknown): Record<string, MediaVariant[]> {
    const out: Record<string, MediaVariant[]> = {};

    const formatKeys = new Set([
      'gif',
      'mediumgif',
      'tinygif',
      'nanogif',
      'mp4',
      'loopedmp4',
      'tinymp4',
      'nanomp4',
      'webm',
      'tinywebm',
      'nanowebm',
      'preview',
    ]);

    const push = (format: string, url: string, meta?: Partial<MediaVariant>): void => {
      const bucket = out[format] ?? (out[format] = []);
      bucket.push({ url, ...meta });
    };

    const asNumber = (value: unknown): number | undefined =>
      typeof value === 'number' ? value : undefined;

    const visit = (node: unknown): void => {
      if (!node) return;

      if (Array.isArray(node)) {
        for (const item of node) visit(item);
        return;
      }

      if (typeof node !== 'object') return;
      const record = node as Record<string, unknown>;
      const keys = Object.keys(record);

      // Pass 1: renditions declared directly on this node.
      for (const key of keys) {
        if (!formatKeys.has(key)) continue;
        const value = record[key];

        if (typeof value === 'string') {
          // Only accept strings that are actually URLs; format keys such as
          // "preview" can also carry unrelated text.
          if (/^https?:\/\//i.test(value)) push(key, value);
        } else if (typeof value === 'object' && value !== null) {
          const variant = value as Record<string, unknown>;
          if (typeof variant.url === 'string') {
            const dims: unknown = variant.dims;
            const dimAt = (i: number): unknown => (Array.isArray(dims) ? dims[i] : undefined);
            push(key, variant.url, {
              width: asNumber(dimAt(0)) ?? asNumber(variant.width),
              height: asNumber(dimAt(1)) ?? asNumber(variant.height),
            });
          }
        }
      }

      // Pass 2: recurse into every child, including the format-key values.
      for (const key of keys) {
        visit(record[key]);
      }
    };

    visit(data);
    return out;
  }

  /**
   * Chooses one rendition for the preferred container format and size.
   *
   * Format keys are tried in a size-specific preference order; within a key
   * the widest rendition wins (first one on ties). If none of the preferred
   * keys is present, the first rendition under any key whose name contains
   * `prefFormat` is returned.
   *
   * @returns The chosen rendition, or null when nothing matches.
   */
  private pickTenorVariant(
    media: Record<string, MediaVariant[]>,
    prefFormat: 'gif' | 'mp4' | 'webm',
    prefSize: 'original' | 'medium' | 'tiny' | 'nano'
  ): MediaVariant | null {
    const sizeToFormats: Record<
      'gif' | 'mp4' | 'webm',
      Record<'original' | 'medium' | 'tiny' | 'nano', string[]>
    > = {
      gif: {
        original: ['gif'],
        medium: ['mediumgif', 'gif'],
        tiny: ['tinygif', 'nanogif', 'gif'],
        nano: ['nanogif', 'tinygif', 'gif'],
      },
      mp4: {
        original: ['mp4', 'loopedmp4'],
        medium: ['mp4', 'tinymp4'],
        tiny: ['tinymp4', 'nanomp4', 'mp4'],
        nano: ['nanomp4', 'tinymp4', 'mp4'],
      },
      webm: {
        original: ['webm'],
        medium: ['webm', 'tinywebm'],
        tiny: ['tinywebm', 'nanowebm', 'webm'],
        nano: ['nanowebm', 'tinywebm', 'webm'],
      },
    };

    for (const key of sizeToFormats[prefFormat][prefSize]) {
      const list = media[key];
      if (!list?.length) continue;

      let best: MediaVariant | undefined;
      for (const candidate of list) {
        if (best === undefined || (candidate.width ?? 0) > (best.width ?? 0)) {
          best = candidate;
        }
      }
      if (best) return best;
    }

    for (const key of Object.keys(media)) {
      if (!key.includes(prefFormat)) continue;
      const first = media[key]?.[0];
      if (first) return first;
    }

    return null;
  }

  /** Extracts the path segment following `/pin/` from a Pinterest URL, or null. */
  private extractPinterestId(url: string): string | null {
    return url.match(/\/pin\/([^\/\?]+)/)?.[1] ?? null;
  }

  /**
   * Requests `url` without following redirects and returns the `Location`
   * header of the response.
   *
   * @returns The redirect target, or null if there is none or the request failed.
   */
  private async resolveRedirect(url: string): Promise<string | null> {
    try {
      const response = await fetch(url, {
        redirect: 'manual',
        headers: {
          'User-Agent': this.USER_AGENT,
        },
      });

      return response.headers.get('location');
    } catch (error) {
      console.warn('Failed to resolve redirect:', error);
      return null;
    }
  }

  /**
   * Returns the known media extension that the URL's path ends with
   * (lower-cased, including the dot), or null.
   *
   * Only the path is inspected, so extensions appearing in the host name or
   * query string (e.g. `my.giftshop.com`, `?file=a.mp4`) do not count.
   */
  private mediaExtensionOf(url: string): string | null {
    let path: string;
    try {
      path = new URL(url).pathname;
    } catch {
      // Not an absolute URL: strip query and fragment by hand.
      path = url.split(/[?#]/)[0] ?? '';
    }

    const lowerPath = path.toLowerCase();
    return ScraperService.MEDIA_EXTENSIONS.find((ext) => lowerPath.endsWith(ext)) ?? null;
  }

  /** True when the URL's path ends with a known image/video file extension. */
  private isDirectMediaUrl(url: string): boolean {
    return this.mediaExtensionOf(url) !== null;
  }

  /**
   * Classifies a URL by the file extension of its path.
   * `.gif` is 'gif', `.mp4`/`.webm`/`.mov` are 'video', anything else is 'image'.
   */
  private guessMediaType(url: string): 'image' | 'video' | 'gif' {
    const ext = this.mediaExtensionOf(url);

    if (ext === '.gif') return 'gif';
    if (ext !== null && ScraperService.VIDEO_EXTENSIONS.includes(ext)) return 'video';
    return 'image';
  }

  /**
   * Maps a Content-Type header value to a media category.
   * Parameters after `;` and letter case are ignored.
   *
   * @returns 'gif' for image/gif, 'image' for other image types,
   *   'video' for video types, null otherwise or when the header is absent.
   */
  private classifyContentType(header: string | null): 'image' | 'video' | 'gif' | null {
    const mime = ((header ?? '').split(';')[0] ?? '').trim().toLowerCase();

    if (mime === 'image/gif') return 'gif';
    if (mime.startsWith('image/')) return 'image';
    if (mime.startsWith('video/')) return 'video';
    return null;
  }

  /**
   * Asks the server what kind of content `url` serves.
   *
   * A HEAD request is tried first. If it throws or returns a non-success
   * status (some servers reject HEAD), a GET for the first 1 KiB is issued
   * instead and its body is discarded.
   *
   * @returns The media category, or null if the URL is not image/video
   *   content or could not be fetched.
   */
  private async checkMediaContentType(url: string): Promise<'image' | 'video' | 'gif' | null> {
    try {
      const head = await fetch(url, {
        method: 'HEAD',
        headers: {
          'User-Agent': this.USER_AGENT,
        },
      });

      if (head.ok) {
        return this.classifyContentType(head.headers.get('content-type'));
      }
    } catch {
      // HEAD failed outright; fall through to the ranged GET below.
    }

    try {
      const response = await fetch(url, {
        method: 'GET',
        headers: {
          'User-Agent': this.USER_AGENT,
          'Range': 'bytes=0-1023',
        },
      });

      try {
        if (!response.ok) {
          return null;
        }
        return this.classifyContentType(response.headers.get('content-type'));
      } finally {
        // Only the headers are needed; release the connection without downloading the body.
        await response.body?.cancel().catch(() => undefined);
      }
    } catch {
      return null;
    }
  }
}
|
||||
|
||||
/** Module-level ScraperService instance, created once when this module is loaded and exported for importers. */
export const scraperService = new ScraperService();
|
||||
Reference in New Issue
Block a user