// app/apps/backend/src/services/scraper.service.ts

/**
 * Result of resolving a user-supplied URL to one downloadable media file.
 */
interface ScrapedMedia {
  /** URL of the media file itself (not the page it was found on). */
  url: string;
  /** Optional title; no code visible in this file populates it. */
  title?: string;
  /** Optional description; no code visible in this file populates it. */
  description?: string;
  /** Coarse media category of the file at `url`. */
  type: 'image' | 'video' | 'gif';
  /** Which scraper produced the result; `direct` means the input URL was used as-is. */
  source: 'pinterest' | 'tenor' | 'direct';
  /** Suggested file name; set by the Pinterest and Tenor scrapers, omitted for direct URLs. */
  filename?: string;
}

/**
 * One rendition of a Tenor media item as collected from the page's embedded JSON.
 */
type MediaVariant = {
  /** URL of this rendition. */
  url: string;
  /** Pixel width, when the source JSON provided `dims[0]` or `width`. */
  width?: number;
  /** Pixel height, when the source JSON provided `dims[1]` or `height`. */
  height?: number;
  /** Optional label; no code visible in this file populates it. */
  label?: string
};

/**
 * Resolves a user-supplied URL to a single downloadable media file.
 *
 * Supported inputs are Pinterest pins (including `pin.it` short links),
 * Tenor "view" pages, and URLs that point directly at an image or video file.
 */
class ScraperService {
  /** Desktop Chrome user agent sent with every outgoing request. */
  private readonly USER_AGENT =
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

  /** Lower-case path extensions (with leading dot) treated as direct image files. */
  private readonly IMAGE_EXTENSIONS: readonly string[] = [
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.webp',
    '.avif',
  ];

  /** Lower-case path extensions (with leading dot) treated as direct video files. */
  private readonly VIDEO_EXTENSIONS: readonly string[] = ['.mp4', '.webm', '.mov'];

  /**
   * Resolves `url` to a media file.
   *
   * @param url Absolute URL supplied by the caller.
   * @returns The resolved media descriptor.
   * @throws TypeError when `url` is not a parseable absolute URL.
   * @throws Error when the URL is not supported or a site-specific scraper fails.
   */
  async scrapeUrl(url: string): Promise<ScrapedMedia> {
    const hostname = new URL(url).hostname.toLowerCase();

    if (this.isPinterestHost(hostname)) {
      return this.scrapePinterest(url);
    }

    if (this.hostMatches(hostname, 'tenor.com')) {
      // Files served from Tenor's media subdomains (e.g. media.tenor.com/x/y.gif)
      // are already direct media; only page URLs need the HTML scraper.
      const isPageHost = hostname === 'tenor.com' || hostname === 'www.tenor.com';
      if (isPageHost || !this.isDirectMediaUrl(url)) {
        return this.scrapeTenor(url);
      }
    }

    if (this.isDirectMediaUrl(url)) {
      return {
        url,
        type: this.guessMediaType(url),
        source: 'direct',
      };
    }

    // No recognisable extension: ask the server what it would send.
    const mediaType = await this.checkMediaContentType(url);
    if (mediaType) {
      return {
        url,
        type: mediaType,
        source: 'direct',
      };
    }

    throw new Error(`Unsupported URL: ${url}`);
  }

  /** Returns true when `hostname` equals `domain` or is a subdomain of it. */
  private hostMatches(hostname: string, domain: string): boolean {
    return hostname === domain || hostname.endsWith(`.${domain}`);
  }

  /**
   * Returns true for `pin.it` and for Pinterest hosts, including regional
   * domains such as `pinterest.co.uk` or `pinterest.com.au`. Look-alike hosts
   * (`notpinterest.com`, `pinterest.com.evil.net`) are rejected.
   */
  private isPinterestHost(hostname: string): boolean {
    return (
      this.hostMatches(hostname, 'pin.it') ||
      /(^|\.)pinterest\.[a-z]{2,3}(\.[a-z]{2})?$/.test(hostname)
    );
  }

  /**
   * Resolves a Pinterest pin URL (or `pin.it` short link) to its video or image.
   *
   * The pin page is always fetched from `www.pinterest.com`, regardless of the
   * host in `url`.
   *
   * @throws Error prefixed with `Failed to scrape Pinterest:` on any failure.
   */
  private async scrapePinterest(url: string): Promise<ScrapedMedia> {
    try {
      let rawPinId = this.extractPinterestId(url);

      const shortCode = this.extractPinItCode(url);
      if (shortCode) {
        const redirectUrl = await this.resolveRedirect(
          `https://api.pinterest.com/url_shortener/${shortCode}/redirect/`
        );
        if (redirectUrl) {
          rawPinId = this.extractPinterestId(redirectUrl);
        }
      }

      const pinId = rawPinId ? this.normalizePinId(rawPinId) : '';
      if (!pinId) {
        throw new Error('Could not extract Pinterest pin ID');
      }

      const response = await fetch(`https://www.pinterest.com/pin/${pinId}/`, {
        headers: {
          'User-Agent': this.USER_AGENT,
        },
      });
      const html = await response.text();

      const notFoundRegex = /"__typename"\s*:\s*"PinNotFound"/;
      if (notFoundRegex.test(html)) {
        throw new Error('Pinterest pin not found');
      }

      const media = this.parsePinterestMedia(html, pinId);
      if (media) {
        return media;
      }

      // Surface the HTTP status when the page itself failed to load, so the
      // caller can tell a blocked/failed request from a layout change.
      throw new Error(
        response.ok
          ? 'Could not extract media URL from Pinterest'
          : `Pinterest responded with HTTP ${response.status}`
      );
    } catch (error) {
      throw new Error(
        `Failed to scrape Pinterest: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }

  /**
   * Returns the short code of a `pin.it` link (its first path segment), or
   * `null` when `url` is not a `pin.it` link or has no path segment.
   */
  private extractPinItCode(url: string): string | null {
    let parsed: URL;
    try {
      parsed = new URL(url);
    } catch {
      return null;
    }
    if (!this.hostMatches(parsed.hostname.toLowerCase(), 'pin.it')) {
      return null;
    }
    const firstSegment = parsed.pathname.split('/').find((segment) => segment.length > 0);
    return firstSegment ?? null;
  }

  /**
   * Strips a leading `slug--` prefix from a pin identifier. The identifier is
   * the part after the LAST `--`, so slugs that themselves contain `--` work.
   */
  private normalizePinId(rawPinId: string): string {
    const separatorIdx = rawPinId.lastIndexOf('--');
    return separatorIdx === -1 ? rawPinId : rawPinId.slice(separatorIdx + 2);
  }

  /**
   * Extracts the pin's media from the pin page HTML. An `.mp4` video wins over
   * an image; returns `null` when neither is present.
   */
  private parsePinterestMedia(html: string, pinId: string): ScrapedMedia | null {
    const firstCaptures = (regex: RegExp): string[] =>
      [...html.matchAll(regex)]
        .map((match) => match[1])
        .filter((link): link is string => typeof link === 'string');

    const videoRegex = /"url":"(https:\/\/v1\.pinimg\.com\/videos\/.*?)"/g;
    const videoLink = firstCaptures(videoRegex).find((link) => link.endsWith('.mp4'));
    if (videoLink) {
      return {
        url: videoLink,
        type: 'video',
        source: 'pinterest',
        filename: `pinterest_${pinId}.mp4`,
      };
    }

    // `[^"\s]` keeps the match inside one attribute value; a plain `.*?` could
    // start at a non-matching src (e.g. a .png) and run on into a later one.
    const imageRegex = /src="(https:\/\/i\.pinimg\.com\/[^"\s]*?\.(jpg|gif))"/g;
    const imageLink = firstCaptures(imageRegex)[0];
    if (imageLink) {
      const isGif = imageLink.endsWith('.gif');
      return {
        url: imageLink,
        type: isGif ? 'gif' : 'image',
        source: 'pinterest',
        filename: `pinterest_${pinId}.${isGif ? 'gif' : 'jpg'}`,
      };
    }

    return null;
  }

  /**
   * Resolves a Tenor `/view/<slug>-<id>` page to its MP4 rendition.
   *
   * The result is reported with `type: 'gif'` while the file is an `.mp4`.
   *
   * @throws Error prefixed with `Failed to scrape Tenor:` on any failure.
   */
  private async scrapeTenor(url: string): Promise<ScrapedMedia> {
    try {
      const gifId = url.match(/\/view\/[^\/]+-(\d+)/)?.[1];
      if (!gifId) {
        throw new Error('Invalid Tenor URL format');
      }

      const response = await fetch(url, {
        redirect: 'follow',
        headers: {
          'User-Agent': this.USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml',
        },
      });
      const html = await response.text();

      const mediaJson = this.extractTenorMediaJson(html);
      const mediaVariants = this.collectTenorMedia(mediaJson);

      const chosen = this.pickTenorVariant(mediaVariants, 'mp4', 'medium');

      if (!chosen) {
        throw new Error('Could not find suitable media variant');
      }

      return {
        url: chosen.url,
        type: 'gif',
        source: 'tenor',
        filename: `tenor_${gifId}.mp4`,
      };
    } catch (error) {
      throw new Error(
        `Failed to scrape Tenor: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }

  /**
   * Finds and parses the JSON blob embedded in a Tenor page.
   *
   * Candidates are tried in order: `window.__STATE__ = {...};`, a script body
   * starting with `{"appConfig"`, and the `__NEXT_DATA__` script tag. The first
   * candidate that parses (directly, or after brace-balanced extraction) wins.
   *
   * @throws Error when no candidate can be parsed.
   */
  private extractTenorMediaJson(html: string): unknown {
    const candidates: string[] = [];

    const stateMatch = html.match(/window\.__STATE__\s*=\s*(\{[\s\S]*?\});/);
    if (stateMatch?.[1]) {
      candidates.push(stateMatch[1]);
    }

    const appConfigIdx = html.indexOf('{"appConfig"');
    if (appConfigIdx !== -1) {
      // Cap the slice so a missing </script> cannot make us scan an unbounded tail.
      const slice = html.slice(appConfigIdx, appConfigIdx + 2_000_000);
      const endTagIdx = slice.indexOf('</script>');
      candidates.push(endTagIdx !== -1 ? slice.slice(0, endTagIdx) : slice);
    }

    const nextDataMatch = html.match(
      /<script[^>]*id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/
    );
    if (nextDataMatch?.[1]) {
      candidates.push(nextDataMatch[1]);
    }

    for (const raw of candidates) {
      try {
        return JSON.parse(raw) as unknown;
      } catch {
        // The candidate may carry trailing script text; retry on just the
        // first balanced object.
        const extracted = this.extractFirstJsonObject(raw);
        if (extracted) {
          try {
            return JSON.parse(extracted) as unknown;
          } catch {
            // Not JSON after all; try the next candidate.
          }
        }
      }
    }

    throw new Error('Unable to locate Tenor media JSON in page');
  }

  /**
   * Returns the first brace-balanced `{...}` substring of `s`, or `null` when
   * there is none. Braces inside single- or double-quoted strings are ignored,
   * and backslash escapes inside strings are honoured.
   */
  private extractFirstJsonObject(s: string): string | null {
    let depth = 0;
    let inStr: string | null = null;
    let escape = false;
    let start = -1;

    for (let i = 0; i < s.length; i++) {
      const c = s[i];

      if (inStr) {
        if (escape) {
          escape = false;
        } else if (c === '\\') {
          escape = true;
        } else if (c === inStr) {
          inStr = null;
        }
        continue;
      }

      if (c === '"' || c === "'") {
        inStr = c;
      } else if (c === '{') {
        if (depth === 0) start = i;
        depth++;
      } else if (c === '}') {
        // A closer with nothing open is stray text before the object; letting
        // it push depth negative would make every later object unbalanced.
        if (depth === 0) continue;
        depth--;
        if (depth === 0 && start !== -1) {
          return s.slice(start, i + 1);
        }
      }
    }
    return null;
  }

  /**
   * Walks arbitrary parsed JSON and collects every Tenor media rendition found,
   * grouped by format key (`gif`, `mp4`, `tinymp4`, ...).
   *
   * A rendition is recognised under a known format key either as an object
   * with a string `url` (dimensions taken from `dims` or `width`/`height`), or
   * as a bare `http(s)` URL string.
   */
  private collectTenorMedia(data: unknown): Record<string, MediaVariant[]> {
    const out: Record<string, MediaVariant[]> = {};

    const formatKeys = new Set([
      'gif',
      'mediumgif',
      'tinygif',
      'nanogif',
      'mp4',
      'loopedmp4',
      'tinymp4',
      'nanomp4',
      'webm',
      'tinywebm',
      'nanowebm',
      'preview',
    ]);

    const push = (format: string, url: string, meta?: Partial<MediaVariant>): void => {
      const list = out[format] ?? (out[format] = []);
      list.push({ url, ...meta });
    };

    const asNumber = (value: unknown): number | undefined =>
      typeof value === 'number' ? value : undefined;

    const visit = (node: unknown): void => {
      if (!node || typeof node !== 'object') return;
      if (Array.isArray(node)) {
        for (const item of node) visit(item);
        return;
      }

      const record = node as Record<string, unknown>;

      // First pass: record renditions declared directly on this object.
      for (const [key, value] of Object.entries(record)) {
        if (!formatKeys.has(key)) continue;

        if (typeof value === 'string') {
          // Only accept strings that look like URLs; format keys such as
          // "gif" can also hold unrelated text.
          if (/^https?:\/\//i.test(value)) push(key, value);
        } else if (value && typeof value === 'object') {
          const variant = value as Record<string, unknown>;
          if (typeof variant.url === 'string') {
            const dims: unknown[] = Array.isArray(variant.dims) ? variant.dims : [];
            push(key, variant.url, {
              width: asNumber(dims[0]) ?? asNumber(variant.width),
              height: asNumber(dims[1]) ?? asNumber(variant.height),
            });
          }
        }
      }

      // Second pass: descend into every child.
      for (const value of Object.values(record)) {
        visit(value);
      }
    };

    visit(data);
    return out;
  }

  /**
   * Picks one rendition from `media` for the preferred container and size.
   *
   * Format keys are tried in a per-size preference order; within a key the
   * widest rendition wins. If none of the preferred keys exist, the first
   * rendition of any key containing `prefFormat` is used.
   *
   * @returns The chosen rendition, or `null` when nothing matches.
   */
  private pickTenorVariant(
    media: Record<string, MediaVariant[]>,
    prefFormat: 'gif' | 'mp4' | 'webm',
    prefSize: 'original' | 'medium' | 'tiny' | 'nano'
  ): MediaVariant | null {
    const sizeToFormats: Record<
      'gif' | 'mp4' | 'webm',
      Record<'original' | 'medium' | 'tiny' | 'nano', string[]>
    > = {
      gif: {
        original: ['gif'],
        medium: ['mediumgif', 'gif'],
        tiny: ['tinygif', 'nanogif', 'gif'],
        nano: ['nanogif', 'tinygif', 'gif'],
      },
      mp4: {
        original: ['mp4', 'loopedmp4'],
        medium: ['mp4', 'tinymp4'],
        tiny: ['tinymp4', 'nanomp4', 'mp4'],
        nano: ['nanomp4', 'tinymp4', 'mp4'],
      },
      webm: {
        original: ['webm'],
        medium: ['webm', 'tinywebm'],
        tiny: ['tinywebm', 'nanowebm', 'webm'],
        nano: ['nanowebm', 'tinywebm', 'webm'],
      },
    };

    for (const key of sizeToFormats[prefFormat][prefSize]) {
      const list = media[key];
      if (list?.length) {
        // Sort a copy so the caller's arrays are left untouched.
        const best = [...list].sort((a, b) => (b.width ?? 0) - (a.width ?? 0))[0];
        if (best) return best;
      }
    }

    for (const key of Object.keys(media)) {
      if (!key.includes(prefFormat)) continue;
      const first = media[key]?.[0];
      if (first) return first;
    }

    return null;
  }

  /** Returns the path segment following `/pin/` in `url`, or `null` if absent. */
  private extractPinterestId(url: string): string | null {
    const match = url.match(/\/pin\/([^\/\?]+)/);
    return match?.[1] ?? null;
  }

  /**
   * Requests `url` without following redirects and returns the `Location`
   * response header, or `null` when there is none or the request fails.
   */
  private async resolveRedirect(url: string): Promise<string | null> {
    try {
      const response = await fetch(url, {
        redirect: 'manual',
        headers: {
          'User-Agent': this.USER_AGENT,
        },
      });

      return response.headers.get('location');
    } catch (error) {
      console.warn('Failed to resolve redirect:', error);
      return null;
    }
  }

  /**
   * Returns the lower-cased extension (with leading dot) of the last path
   * segment of `url`, ignoring query string and fragment; `''` when none.
   */
  private getPathExtension(url: string): string {
    let path: string;
    try {
      path = new URL(url).pathname;
    } catch {
      // Not an absolute URL: strip query/fragment by hand.
      path = url.split(/[?#]/)[0] ?? '';
    }
    const lastSegment = path.slice(path.lastIndexOf('/') + 1).toLowerCase();
    const dotIdx = lastSegment.lastIndexOf('.');
    return dotIdx === -1 ? '' : lastSegment.slice(dotIdx);
  }

  /**
   * Returns true when the URL's path ends in a known image or video extension.
   * Only the path is inspected, so hosts such as `www.moviesite.com` or query
   * strings that merely contain `.gif` do not count.
   */
  private isDirectMediaUrl(url: string): boolean {
    const ext = this.getPathExtension(url);
    return this.IMAGE_EXTENSIONS.includes(ext) || this.VIDEO_EXTENSIONS.includes(ext);
  }

  /**
   * Classifies a URL by its path extension: `.gif` is `gif`, known video
   * extensions are `video`, everything else (including no extension) is `image`.
   */
  private guessMediaType(url: string): 'image' | 'video' | 'gif' {
    const ext = this.getPathExtension(url);
    if (ext === '.gif') return 'gif';
    if (this.VIDEO_EXTENSIONS.includes(ext)) return 'video';
    return 'image';
  }

  /**
   * Maps a `Content-Type` header value to a media category. Parameters after
   * `;` (e.g. `charset`) and letter case are ignored.
   *
   * @returns `null` for a missing header or a non image/video type.
   */
  private classifyContentType(header: string | null): 'image' | 'video' | 'gif' | null {
    const mediaType = (header ?? '').split(';')[0]?.trim().toLowerCase() ?? '';

    if (mediaType === 'image/gif') return 'gif';
    if (mediaType.startsWith('image/')) return 'image';
    if (mediaType.startsWith('video/')) return 'video';
    return null;
  }

  /**
   * Asks the server what kind of content `url` serves.
   *
   * A HEAD request is tried first. If it throws, is rejected (non-2xx, e.g.
   * 405 from servers that do not allow HEAD) or carries no `Content-Type`, a
   * ranged GET for the first kilobyte is used instead.
   *
   * @returns The media category, or `null` when the content is not media or
   *          the server cannot be reached.
   */
  private async checkMediaContentType(url: string): Promise<'image' | 'video' | 'gif' | null> {
    try {
      const head = await fetch(url, {
        method: 'HEAD',
        headers: {
          'User-Agent': this.USER_AGENT,
        },
      });
      const headContentType = head.headers.get('content-type');
      if (head.ok && headContentType) {
        return this.classifyContentType(headContentType);
      }
    } catch {
      // Network-level failure of HEAD: fall through to the GET probe.
    }

    try {
      const response = await fetch(url, {
        method: 'GET',
        headers: {
          'User-Agent': this.USER_AGENT,
          'Range': 'bytes=0-1023',
        },
      });

      const mediaType = response.ok
        ? this.classifyContentType(response.headers.get('content-type'))
        : null;

      // Only the headers are needed. Discard the body so a server that
      // ignores Range does not make us download the whole file.
      await response.body?.cancel().catch(() => undefined);

      return mediaType;
    } catch {
      return null;
    }
  }
}

/** Module-level `ScraperService` instance, constructed once when this module is loaded. */
export const scraperService = new ScraperService();