From ca1ad6bec0c4a1493e132cd357cc1b68027b5f65 Mon Sep 17 00:00:00 2001
From: "user.mail"
Date: Thu, 15 Jan 2026 14:11:02 +0300
Subject: [PATCH] feat(instagram): complete Instagram scraper with documented response fields

- Add full Instagram scraper implementation with URL-based and discovery endpoints:
  - scrape.instagram: profiles, posts, reels, comments (by URL)
  - search.instagram: profiles (by username), posts, reels, reels_all (by profile URL)
- Document all API response fields in method docstrings
- Add date validation for Instagram MM-DD-YYYY format

refactor(version): centralize version management

- Single source of truth in pyproject.toml (v2.1.1)
- Use importlib.metadata.version() in __init__.py
- Remove redundant _version.py
- Update engine.py and cli/main.py to import __version__

chore: add test notebooks for Instagram and v2.1.0 release verification

Co-Authored-By: Claude Opus 4.5
---
 CHANGELOG.md                                  |  62 ++
 notebooks/instagram.ipynb                     | 934 ++++++++++++++++++
 notebooks/test_v2.1.0_release.ipynb           | 497 ++++++++++
 pyproject.toml                                |   2 +-
 src/brightdata/__init__.py                    |   8 +-
 src/brightdata/_version.py                    |   3 -
 src/brightdata/cli/main.py                    |   3 +-
 src/brightdata/core/engine.py                 |   3 +-
 src/brightdata/models.py                      |   2 +-
 src/brightdata/scrapers/api_client.py         |   6 +
 src/brightdata/scrapers/base.py               |  23 +-
 src/brightdata/scrapers/instagram/__init__.py |   2 +-
 src/brightdata/scrapers/instagram/scraper.py  | 615 +++++++-----
 src/brightdata/scrapers/instagram/search.py   | 552 +++++++----
 src/brightdata/scrapers/workflow.py           |   4 +
 src/brightdata/utils/validation.py            |  42 +
 tests/unit/test_function_detection.py         |  14 +-
 tests/unit/test_instagram.py                  |  91 +-
 18 files changed, 2382 insertions(+), 481 deletions(-)
 create mode 100644 notebooks/instagram.ipynb
 create mode 100644 notebooks/test_v2.1.0_release.ipynb
 delete mode 100644 src/brightdata/_version.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0cff2ba..2b9b81c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,67 @@
 # Bright Data Python SDK Changelog
 
+## Version 2.1.1 - Instagram Scrapers & Version Centralization
+
+### ✨ New Features
+
+#### Complete Instagram Scraper Implementation
+
+Full Instagram scraping support with URL-based extraction and discovery endpoints:
+
+**URL-based Scraping (`client.scrape.instagram`)**
+- `profiles(url)` - Extract profile data from Instagram profile URL
+- `posts(url)` - Extract post data from Instagram post URL
+- `comments(url)` - Extract comments from Instagram post URL
+- `reels(url)` - Extract reel data from Instagram reel URL
+
+**Discovery/Search (`client.search.instagram`)**
+- `profiles(user_name)` - Discover profile by exact username lookup
+- `posts(url, num_of_posts, start_date, end_date, post_type)` - Discover posts from profile
+- `reels(url, num_of_posts, start_date, end_date)` - Discover reels from profile
+- `reels_all(url, num_of_posts, start_date, end_date)` - Discover all reels from profile
+
+```python
+async with BrightDataClient() as client:
+    # URL-based scraping
+    post = await client.scrape.instagram.posts("https://instagram.com/p/ABC123/")
+    reel = await client.scrape.instagram.reels("https://instagram.com/reel/XYZ789/")
+
+    # Discovery by username
+    profile = await client.search.instagram.profiles(user_name="nasa")
+
+    # Discover posts from profile with filters
+    posts = await client.search.instagram.posts(
+        url="https://instagram.com/nasa",
+        num_of_posts=10,
+        start_date="2024-01-01",
+        end_date="2024-12-31"
+    )
+```
+
+### 🔧 Internal Improvements
+
+#### Version Centralization
+
+Version
is now managed from a single source (`pyproject.toml`). All other files read it dynamically via `importlib.metadata`. + +**Before (5 files to update):** +- `pyproject.toml` +- `src/brightdata/__init__.py` +- `src/brightdata/_version.py` +- `src/brightdata/core/engine.py` +- `src/brightdata/cli/main.py` + +**After (1 file to update):** +- `pyproject.toml` ← Single source of truth + +**Changes:** +- `__init__.py` now uses `importlib.metadata.version("brightdata-sdk")` +- `_version.py` deleted (no longer needed) +- `engine.py` imports `__version__` for User-Agent header +- `cli/main.py` imports `__version__` for `--version` flag + +--- + ## Version 2.1.0 - Async Mode, API Simplification & Bug Fixes ### ✨ New Features diff --git a/notebooks/instagram.ipynb b/notebooks/instagram.ipynb new file mode 100644 index 0000000..104f1d4 --- /dev/null +++ b/notebooks/instagram.ipynb @@ -0,0 +1,934 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Instagram Scraper - Testing Notebook\n", + "\n", + "Test the Instagram scraper implementation:\n", + "1. **InstagramScraper** - URL-based extraction (profiles, posts, reels, comments)\n", + "2. **InstagramSearchScraper** - Parameter-based discovery with `extra_params`\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup - Use Local Development Version" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using source from: /Users/ns/Desktop/projects/sdk-python/src\n", + "API Token: 7011787d-2...3336\n", + "Setup complete!\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "from pathlib import Path\n", + "\n", + "# Add local src to path (use development version, not installed)\n", + "project_root = Path.cwd().parent\n", + "src_path = project_root / \"src\"\n", + "if str(src_path) not in sys.path:\n", + " sys.path.insert(0, str(src_path))\n", + "\n", + "print(f\"Using source from: {src_path}\")\n", + "\n", + "# Load environment variables\n", + "from dotenv import load_dotenv\n", + "load_dotenv(project_root / \".env\")\n", + "\n", + "# Get API token\n", + "API_TOKEN = os.getenv(\"BRIGHTDATA_API_TOKEN\")\n", + "if not API_TOKEN:\n", + " raise ValueError(\"BRIGHTDATA_API_TOKEN not found in environment\")\n", + "\n", + "print(f\"API Token: {API_TOKEN[:10]}...{API_TOKEN[-4:]}\")\n", + "print(\"Setup complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Instagram Scrapers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "brightdata module location: /Users/ns/Desktop/projects/sdk-python/src/brightdata/__init__.py\n", + "\n", + "InstagramScraper: InstagramScraper\n", + "InstagramSearchScraper: InstagramSearchScraper\n", + "\n", + "Search scraper methods:\n", + "['posts', 'posts_sync', 'profiles', 'profiles_sync', 'reels', 'reels_all', 'reels_all_sync', 'reels_sync']\n" + ] + } + ], + "source": [ + "from brightdata import BrightDataClient\n", + "from brightdata.scrapers.instagram import InstagramScraper, InstagramSearchScraper\n", + "\n", + "# Verify we're using local version\n", + "import brightdata\n", + "print(f\"brightdata module location: {brightdata.__file__}\")\n", + "\n", + "# Initialize client\n", + "client = BrightDataClient(token=API_TOKEN)\n", + "\n", + "# Verify Instagram scraper is accessible\n", + 
"print(f\"\\nInstagramScraper: {type(client.scrape.instagram).__name__}\")\n", + "print(f\"InstagramSearchScraper: {type(client.search.instagram).__name__}\")\n", + "\n", + "# Check for new methods (profiles discovery, reels_all)\n", + "print(\"\\nSearch scraper methods:\")\n", + "print([m for m in dir(client.search.instagram) if not m.startswith('_') and callable(getattr(client.search.instagram, m))])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Part 1: InstagramScraper (URL-based Extraction)\n", + "\n", + "Test URL-based extraction methods using `await` (required in Jupyter)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.1 Profiles - Extract profile by URL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test post extraction by URL\n", + "POST_URL = \"https://www.instagram.com/p/C0xFHGRItPt/\"\n", + "\n", + "print(f\"Scraping post: {POST_URL}\")\n", + "print(\"This may take 1-3 minutes...\\n\")\n", + "\n", + "async with client.scrape.instagram.engine:\n", + " result = await client.scrape.instagram.posts(url=POST_URL, timeout=180)\n", + "\n", + "print(f\"Success: {result.success}\")\n", + "print(f\"Status: {result.status}\")\n", + "print(f\"Snapshot ID: {result.snapshot_id}\")\n", + "print(f\"Cost: ${result.cost:.4f}\" if result.cost else \"Cost: N/A\")\n", + "\n", + "if result.success and result.data:\n", + " print(\"\\n--- Post Data ---\")\n", + " data = result.data\n", + " # Use actual API field names\n", + " print(f\"Post ID: {data.get('post_id', 'N/A')}\")\n", + " print(f\"Shortcode: {data.get('shortcode', 'N/A')}\")\n", + " print(f\"User: {data.get('user_posted', 'N/A')}\")\n", + " description = str(data.get('description', 'N/A') or 'N/A')\n", + " print(f\"Description: {description[:100]}...\")\n", + " print(f\"Likes: {data.get('likes', 'N/A')}\")\n", + " print(f\"Comments: {data.get('num_comments', 'N/A')}\")\n", + " print(f\"Posted at: {data.get('date_posted', 'N/A')}\")\n", + " print(f\"Content Type: {data.get('content_type', 'N/A')}\")\n", + "else:\n", + " print(f\"\\nError: {result.error}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.2 Posts - Extract post by URL" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Exported to: /Users/ns/Desktop/projects/sdk-python/notebooks/instagram_post_result.json\n", + "\n", + "Data type: \n", + "Data preview: {'url': 'https://www.instagram.com/p/DTGAZJQkg5k/', 'user_posted': 'harrypotter', 'description': 'time for us to embrace the inevitable πŸ˜”', 'num_comments': 60, 'date_posted': '2026-01-04T15:30:16.000Z', 'likes': 40470, 'photos': ['https://scontent-bos5-1.cdninstagram.com/v/t51.2885-15/610176878_18558585163030699_8176807390950763652_n.jpg?stp=dst-jpg_e15_fr_p1080x1080_tt6&_nc_ht=scontent-bos5-1.cdninstagram.com&_nc_cat=107&_nc_oc=Q6cZ2QEkh4dVLXfb9wB7SW8tba5SXhPxPoYVePY5t47wEhrB5SC7fSnvYIs9cFBvJET...\n" + ] + } + ], + "source": [ + "# Export raw data to JSON file for inspection\n", + "import json\n", + "from pathlib import Path\n", + "\n", + "output_file = Path.cwd() / \"instagram_post_result.json\"\n", + "\n", + "export_data = {\n", + " \"success\": result.success,\n", + " \"status\": result.status,\n", + " \"snapshot_id\": result.snapshot_id,\n", + " \"cost\": result.cost,\n", + " \"row_count\": result.row_count,\n", + " \"data\": 
result.data,\n", + " \"error\": result.error,\n", + "}\n", + "\n", + "with open(output_file, \"w\") as f:\n", + " json.dump(export_data, f, indent=2, default=str)\n", + "\n", + "print(f\"Exported to: {output_file}\")\n", + "print(f\"\\nData type: {type(result.data)}\")\n", + "print(f\"Data preview: {str(result.data)[:500]}...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Raw Data ===\n", + "Type: \n", + "Data: {'url': 'https://www.instagram.com/p/DTGAZJQkg5k/', 'user_posted': 'harrypotter', 'description': 'time for us to embrace the inevitable πŸ˜”', 'num_comments': 60, 'date_posted': '2026-01-04T15:30:16.000Z', 'likes': 40470, 'photos': ['https://scontent-bos5-1.cdninstagram.com/v/t51.2885-15/610176878_18558585163030699_8176807390950763652_n.jpg?stp=dst-jpg_e15_fr_p1080x1080_tt6&_nc_ht=scontent-bos5-1.cdninstagram.com&_nc_cat=107&_nc_oc=Q6cZ2QEkh4dVLXfb9wB7SW8tba5SXhPxPoYVePY5t47wEhrB5SC7fSnvYIs9cFBvJETQLtA&_nc_ohc=m4clcQGltTAQ7kNvwGtd33G&_nc_gid=d0unG-mnNeH_KJhc3ChoZQ&edm=ANTKIIoBAAAA&ccb=7-5&oh=00_Afqd-Dygr6WTcuJGtH7X5axfpWO_O_XJAXRk6HBg7Kb-VA&oe=696E87CF&_nc_sid=d885a2'], 'latest_comments': [{'comments': 'πŸ˜‚πŸ˜‚πŸ˜‚', 'user_commenting': 'chees_yrat', 'likes': 0, 'profile_picture': 'https://scontent-bos5-1.cdninstagram.com/v/t51.2885-19/612565466_17874854847470299_5203377400103807929_n.jpg?stp=dst-jpg_s150x150_tt6&efg=eyJ2ZW5jb2RlX3RhZyI6InByb2ZpbGVfcGljLmRqYW5nby4xMDgwLmMyIn0&_nc_ht=scontent-bos5-1.cdninstagram.com&_nc_cat=107&_nc_oc=Q6cZ2QEkh4dVLXfb9wB7SW8tba5SXhPxPoYVePY5t47wEhrB5SC7fSnvYIs9cFBvJETQLtA&_nc_ohc=wBIljjfa1JcQ7kNvwHU8RiQ&_nc_gid=d0unG-mnNeH_KJhc3ChoZQ&edm=ANTKIIoBAAAA&ccb=7-5&oh=00_AfoCXbIGnhzeLyxNQvh68Ib0_GWyQqoivWDlSrq1y7od-A&oe=696EA450&_nc_sid=d885a2'}, {'comments': 'yeah really', 'user_commenting': 'eva._cruise._', 'likes': 0, 'profile_picture': 'https://scontent-bos5-1.cdninstagram.com/v/t51.2885-19/551998872_17845573161572472_3522370076722414659_n.jpg?stp=dst-jpg_s150x150_tt6&efg=eyJ2ZW5jb2RlX3RhZyI6InByb2ZpbGVfcGljLmRqYW5nby4xMDgwLmMyIn0&_nc_ht=scontent-bos5-1.cdninstagram.com&_nc_cat=100&_nc_oc=Q6cZ2QEkh4dVLXfb9wB7SW8tba5SXhPxPoYVePY5t47wEhrB5SC7fSnvYIs9cFBvJETQLtA&_nc_ohc=m_UEMNB5Y5MQ7kNvwFA9zPy&_nc_gid=d0unG-mnNeH_KJhc3ChoZQ&edm=ANTKIIoBAAAA&ccb=7-5&oh=00_AfpETC2hGcAwdzlmlHZI-9-iENhW2rHjbkFA7KC2PXgopg&oe=696E7777&_nc_sid=d885a2'}, {'comments': \"You didn't ever see Iran. 
Did u?\", 'user_commenting': 'alianeo17', 'likes': 0, 'profile_picture': 'https://scontent-bos5-1.cdninstagram.com/v/t51.2885-19/610678132_17851319829607166_7990429666565212247_n.jpg?stp=dst-jpg_s150x150_tt6&efg=eyJ2ZW5jb2RlX3RhZyI6InByb2ZpbGVfcGljLmRqYW5nby4xMDgwLmMyIn0&_nc_ht=scontent-bos5-1.cdninstagram.com&_nc_cat=102&_nc_oc=Q6cZ2QEkh4dVLXfb9wB7SW8tba5SXhPxPoYVePY5t47wEhrB5SC7fSnvYIs9cFBvJETQLtA&_nc_ohc=W54KcVJz4lwQ7kNvwHJNtAe&_nc_gid=d0unG-mnNeH_KJhc3ChoZQ&edm=ANTKIIoBAAAA&ccb=7-5&oh=00_AfpQODe0F8HgiY-t0Hs85QMOGYaINxRgxEMfXpthrKx7iw&oe=696E7BF4&_nc_sid=d885a2'}, {'comments': 'πŸ™ŒπŸ€™', 'user_commenting': 'sri_ram_3821', 'likes': 0, 'profile_picture': 'https://scontent-bos5-1.cdninstagram.com/v/t51.2885-19/606911498_17978799539967190_7193831612709594702_n.jpg?stp=dst-jpg_s150x150_tt6&efg=eyJ2ZW5jb2RlX3RhZyI6InByb2ZpbGVfcGljLmRqYW5nby4xMDgwLmMyIn0&_nc_ht=scontent-bos5-1.cdninstagram.com&_nc_cat=110&_nc_oc=Q6cZ2QEkh4dVLXfb9wB7SW8tba5SXhPxPoYVePY5t47wEhrB5SC7fSnvYIs9cFBvJETQLtA&_nc_ohc=8ZQgemSrQNEQ7kNvwH85jgJ&_nc_gid=d0unG-mnNeH_KJhc3ChoZQ&edm=ANTKIIoBAAAA&ccb=7-5&oh=00_AfrvtIsP5RosO7SjyybjZ7LA_kxT5hU-UX_yTsRO0Y2YAg&oe=696E79F0&_nc_sid=d885a2'}, {'comments': 'Soo true πŸ˜‚πŸ‘', 'user_commenting': 'ewren260115', 'likes': 0, 'profile_picture': 'https://scontent-bos5-1.cdninstagram.com/v/t51.2885-19/587502453_17891215116401886_6220944050267680102_n.jpg?stp=dst-jpg_s150x150_tt6&efg=eyJ2ZW5jb2RlX3RhZyI6InByb2ZpbGVfcGljLmRqYW5nby42NzYuYzIifQ&_nc_ht=scontent-bos5-1.cdninstagram.com&_nc_cat=106&_nc_oc=Q6cZ2QEkh4dVLXfb9wB7SW8tba5SXhPxPoYVePY5t47wEhrB5SC7fSnvYIs9cFBvJETQLtA&_nc_ohc=NbY1tjjh1WoQ7kNvwFrQUpX&_nc_gid=d0unG-mnNeH_KJhc3ChoZQ&edm=ANTKIIoBAAAA&ccb=7-5&oh=00_AfqjNx33f1ollPvvjZbGNA3Uya7jZuPnbeBAs9ol2rnSXA&oe=696E9D73&_nc_sid=d885a2'}], 'post_id': '3802728663289564772', 'shortcode': 'DTGAZJQkg5k', 'content_type': 'Image', 'pk': '3802728663289564772', 'content_id': 'DTGAZJQkg5k', 'thumbnail': 'https://scontent-bos5-1.cdninstagram.com/v/t51.2885-15/610176878_18558585163030699_8176807390950763652_n.jpg?stp=c0.135.1080.1080a_dst-jpg_e35_s640x640_sh0.08_tt6&_nc_ht=scontent-bos5-1.cdninstagram.com&_nc_cat=107&_nc_oc=Q6cZ2QEkh4dVLXfb9wB7SW8tba5SXhPxPoYVePY5t47wEhrB5SC7fSnvYIs9cFBvJETQLtA&_nc_ohc=m4clcQGltTAQ7kNvwGtd33G&_nc_gid=d0unG-mnNeH_KJhc3ChoZQ&edm=ANTKIIoBAAAA&ccb=7-5&oh=00_AfqCVFvGJGSi9D060Kruf5PwxbuhsmDzDc8AOFX1RFNZuw&oe=696E87CF&_nc_sid=d885a2', 'followers': 12993751, 'posts_count': 5892, 'profile_image_link': 'https://scontent-bos5-1.cdninstagram.com/v/t51.2885-19/609185349_18557988352030699_1716891283191165075_n.jpg?stp=dst-jpg_s150x150_tt6&efg=eyJ2ZW5jb2RlX3RhZyI6InByb2ZpbGVfcGljLmRqYW5nby4xMDgwLmMyIn0&_nc_ht=scontent-bos5-1.cdninstagram.com&_nc_cat=1&_nc_oc=Q6cZ2QEkh4dVLXfb9wB7SW8tba5SXhPxPoYVePY5t47wEhrB5SC7fSnvYIs9cFBvJETQLtA&_nc_ohc=UeSbyvI6ApsQ7kNvwGORVSk&_nc_gid=d0unG-mnNeH_KJhc3ChoZQ&edm=ANTKIIoBAAAA&ccb=7-5&oh=00_AfrAb-M1LKZCM9WKtz6mbkVQAouxoCZanPd9x7EJT8O0dQ&oe=696EA279&_nc_sid=d885a2', 'is_verified': True, 'is_paid_partnership': False, 'partnership_details': None, 'user_posted_id': '1315934698', 'post_content': [{'index': 0, 'type': 'Photo', 'url': 
'https://scontent-bos5-1.cdninstagram.com/v/t51.2885-15/610176878_18558585163030699_8176807390950763652_n.jpg?stp=dst-jpg_e15_fr_p1080x1080_tt6&_nc_ht=scontent-bos5-1.cdninstagram.com&_nc_cat=107&_nc_oc=Q6cZ2QEkh4dVLXfb9wB7SW8tba5SXhPxPoYVePY5t47wEhrB5SC7fSnvYIs9cFBvJETQLtA&_nc_ohc=m4clcQGltTAQ7kNvwGtd33G&_nc_gid=d0unG-mnNeH_KJhc3ChoZQ&edm=ANTKIIoBAAAA&ccb=7-5&oh=00_Afqd-Dygr6WTcuJGtH7X5axfpWO_O_XJAXRk6HBg7Kb-VA&oe=696E87CF&_nc_sid=d885a2', 'id': '3802728663289564772', 'alt_text': \"Photo by Harry Potter on January 04, 2026. May be a meme of text that says 'failing my new year's resolutions already 2025 2025me me 2026 2026me me'.\"}], 'audio': None, 'profile_url': 'https://www.instagram.com/harrypotter', 'videos_duration': None, 'images': [], 'alt_text': \"Photo by Harry Potter on January 04, 2026. May be a meme of text that says 'failing my new year's resolutions already 2025 2025me me 2026 2026me me'.\", 'photos_number': 0, 'timestamp': '2026-01-15T09:57:02.671Z', 'input': {'url': 'https://www.instagram.com/p/DTGAZJQkg5k/'}}\n" + ] + } + ], + "source": [ + "# Debug: See actual data structure\n", + "print(\"=== Raw Data ===\")\n", + "print(f\"Type: {type(result.data)}\")\n", + "print(f\"Data: {result.data}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "post_img_link_with_carausel =\"https://www.instagram.com/p/DTLo9uhDPCn/\"\n", + "post_img_link=\"https://www.instagram.com/p/DTGAZJQkg5k/\"\n", + "reel_link=\"https://www.instagram.com/reel/DTQygzxD6QC/\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Scraping post: https://www.instagram.com/p/DTGAZJQkg5k/\n", + "This may take 1-3 minutes...\n", + "\n", + "Success: True\n", + "Status: ready\n", + "Snapshot ID: sd_mkfabhyt24w4dmf365\n", + "Cost: $0.0020\n", + "\n", + "--- Post Data ---\n", + "Post ID: 3802728663289564772\n", + "Shortcode: DTGAZJQkg5k\n", + "User: harrypotter\n", + "Description: time for us to embrace the inevitable πŸ˜”...\n", + "Likes: 40471\n", + "Comments: 60\n", + "Posted at: 2026-01-04T15:30:16.000Z\n", + "Content Type: Image\n" + ] + } + ], + "source": [ + "# Test post extraction by URL\n", + "\n", + "POST_URL = post_img_link\n", + "\n", + "print(f\"Scraping post: {POST_URL}\")\n", + "print(\"This may take 1-3 minutes...\\n\")\n", + "\n", + "async with client.scrape.instagram.engine:\n", + " result = await client.scrape.instagram.posts(url=POST_URL, timeout=180)\n", + "\n", + "print(f\"Success: {result.success}\")\n", + "print(f\"Status: {result.status}\")\n", + "print(f\"Snapshot ID: {result.snapshot_id}\")\n", + "print(f\"Cost: ${result.cost:.4f}\" if result.cost else \"Cost: N/A\")\n", + "\n", + "if result.success and result.data:\n", + " print(\"\\n--- Post Data ---\")\n", + " data = result.data\n", + " # Correct API field names\n", + " print(f\"Post ID: {data.get('post_id', 'N/A')}\")\n", + " print(f\"Shortcode: {data.get('shortcode', 'N/A')}\")\n", + " print(f\"User: {data.get('user_posted', 'N/A')}\")\n", + " description = str(data.get('description', 'N/A') or 'N/A')\n", + " print(f\"Description: {description[:100]}...\")\n", + " print(f\"Likes: {data.get('likes', 'N/A')}\")\n", + " print(f\"Comments: {data.get('num_comments', 'N/A')}\")\n", + " print(f\"Posted at: {data.get('date_posted', 'N/A')}\")\n", + " print(f\"Content Type: {data.get('content_type', 'N/A')}\")\n", + "else:\n", + " print(f\"\\nError: {result.error}\")" + ] + 
}, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.3 Reels - Extract reel by URL" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Scraping reel: https://www.instagram.com/reel/DTQygzxD6QC/\n", + "This may take 1-3 minutes...\n", + "\n", + "Success: True\n", + "Status: ready\n", + "Snapshot ID: sd_mkfag4wn1g4ijaer1b\n", + "Cost: $0.0020\n", + "\n", + "--- Reel Data ---\n", + "Available keys: ['url', 'user_posted', 'description', 'hashtags', 'num_comments', 'date_posted', 'likes', 'views', 'video_play_count', 'top_comments', 'post_id', 'thumbnail', 'shortcode', 'content_id', 'product_type', 'coauthor_producers', 'tagged_users', 'length', 'video_url', 'audio_url', 'posts_count', 'followers', 'following', 'user_profile_url', 'is_paid_partnership', 'partnership_details', 'is_verified', 'profile_image_link', 'timestamp', 'input']\n", + "\n", + "Post ID: 3805763842060821506_1315934698\n", + "Shortcode: DTQygzxD6QC\n", + "User: harrypotter\n", + "Description: physically, we're at our desks, mentally, we're sitting in front of a cosy fire in our knitted jumpe...\n", + "Likes: 41422\n", + "Comments: 156\n", + "Views: 314279\n", + "Play Count: 1079845\n", + "Length: 9.515 seconds\n", + "Product Type: clips\n", + "Video URL: https://scontent-lax3-2.cdninstagram.com/o1/v/t2/f2/m86/AQOILcfjU2DKCJYTfjWnXrRK...\n" + ] + } + ], + "source": [ + "# Test reel extraction by URL\n", + "REEL_URL = reel_link\n", + "\n", + "print(f\"Scraping reel: {REEL_URL}\")\n", + "print(\"This may take 1-3 minutes...\\n\")\n", + "\n", + "async with client.scrape.instagram.engine:\n", + " result = await client.scrape.instagram.reels(url=REEL_URL, timeout=180)\n", + "\n", + "print(f\"Success: {result.success}\")\n", + "print(f\"Status: {result.status}\")\n", + "print(f\"Snapshot ID: {result.snapshot_id}\")\n", + "print(f\"Cost: ${result.cost:.4f}\" if result.cost else \"Cost: N/A\")\n", + "\n", + "if result.success and result.data:\n", + " print(\"\\n--- Reel Data ---\")\n", + " data = result.data\n", + " # Show available keys first\n", + " print(f\"Available keys: {list(data.keys()) if isinstance(data, dict) else 'N/A'}\")\n", + " # Correct API field names for reels\n", + " print(f\"\\nPost ID: {data.get('post_id', 'N/A')}\")\n", + " print(f\"Shortcode: {data.get('shortcode', 'N/A')}\")\n", + " print(f\"User: {data.get('user_posted', 'N/A')}\")\n", + " description = str(data.get('description', 'N/A') or 'N/A')\n", + " print(f\"Description: {description[:100]}...\")\n", + " print(f\"Likes: {data.get('likes', 'N/A')}\")\n", + " print(f\"Comments: {data.get('num_comments', 'N/A')}\")\n", + " print(f\"Views: {data.get('views', 'N/A')}\")\n", + " print(f\"Play Count: {data.get('video_play_count', 'N/A')}\")\n", + " print(f\"Length: {data.get('length', 'N/A')} seconds\")\n", + " print(f\"Product Type: {data.get('product_type', 'N/A')}\")\n", + " print(f\"Video URL: {data.get('video_url', 'N/A')[:80]}...\" if data.get('video_url') else \"Video URL: N/A\")\n", + "else:\n", + " print(f\"\\nError: {result.error}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Video saved to: /Users/ns/Desktop/projects/sdk-python/notebooks/instagram_reel_DTQygzxD6QC.mp4\n" + ] + } + ], + "source": [ + "# Quick video download\n", + "import aiohttp\n", + "from pathlib import Path\n", + "\n", + "async def 
download_video(url: str, filename: str) -> str:\n", + " \"\"\"Download video from URL.\"\"\"\n", + " output_path = Path.cwd() / filename\n", + " async with aiohttp.ClientSession() as session:\n", + " async with session.get(url) as response:\n", + " if response.status == 200:\n", + " with open(output_path, 'wb') as f:\n", + " f.write(await response.read())\n", + " return str(output_path)\n", + " else:\n", + " return f\"Error: {response.status}\"\n", + "\n", + "# Download the reel video\n", + "if result.success and result.data and result.data.get('video_url'):\n", + " video_url = result.data['video_url']\n", + " shortcode = result.data.get('shortcode', 'reel')\n", + " filename = f\"instagram_reel_{shortcode}.mp4\"\n", + " \n", + " saved_path = await download_video(video_url, filename)\n", + " print(f\"Video saved to: {saved_path}\")\n", + "else:\n", + " print(\"No video URL available\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.4 Comments - Extract comments by post URL" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Scraping comments from: https://www.instagram.com/p/DTGAZJQkg5k/\n", + "This may take 1-3 minutes...\n", + "\n", + "Success: True\n", + "Status: ready\n", + "Snapshot ID: sd_mkfb04rs2nlefcer20\n", + "Cost: $0.0300\n", + "\n", + "--- Comments Data ---\n", + "Number of comments: 15\n", + "Available keys: ['url', 'comment_user', 'comment_user_url', 'comment_date', 'comment', 'likes_number', 'replies_number', 'post_url', 'post_user', 'comment_id', 'post_id', 'timestamp', 'input']\n", + "\n", + "Comment 1:\n", + " User: annisaameilani.a\n", + " User URL: https://www.instagram.com/annisaameilani.a\n", + " Text: πŸ˜‚πŸ˜‚πŸ˜‚...\n", + " Likes: 0\n", + " Replies: 0\n", + " Date: 2026-01-04T22:23:15.000Z\n", + "\n", + "Comment 2:\n", + " User: swaywarm\n", + " User URL: https://www.instagram.com/swaywarm\n", + " Text: πŸ’”...\n", + " Likes: 0\n", + " Replies: 0\n", + " Date: 2026-01-04T21:03:49.000Z\n", + "\n", + "Comment 3:\n", + " User: manue_arsenault_\n", + " User URL: https://www.instagram.com/manue_arsenault_\n", + " Text: πŸ˜‚...\n", + " Likes: 0\n", + " Replies: 0\n", + " Date: 2026-01-04T19:39:56.000Z\n", + "\n", + "Comment 4:\n", + " User: michileobw\n", + " User URL: https://www.instagram.com/michileobw\n", + " Text: ❀️❀️❀️...\n", + " Likes: 0\n", + " Replies: 0\n", + " Date: 2026-01-04T16:50:41.000Z\n", + "\n", + "Comment 5:\n", + " User: aboodmx30\n", + " User URL: https://www.instagram.com/aboodmx30\n", + " Text: πŸ˜‚πŸ˜‚...\n", + " Likes: 0\n", + " Replies: 0\n", + " Date: 2026-01-04T17:27:44.000Z\n" + ] + } + ], + "source": [ + "# Test comments extraction by URL\n", + "COMMENTS_URL = post_img_link # Use the post we already tested\n", + "\n", + "print(f\"Scraping comments from: {COMMENTS_URL}\")\n", + "print(\"This may take 1-3 minutes...\\n\")\n", + "\n", + "async with client.scrape.instagram.engine:\n", + " result = await client.scrape.instagram.comments(url=COMMENTS_URL, timeout=180)\n", + "\n", + "print(f\"Success: {result.success}\")\n", + "print(f\"Status: {result.status}\")\n", + "print(f\"Snapshot ID: {result.snapshot_id}\")\n", + "print(f\"Cost: ${result.cost:.4f}\" if result.cost else \"Cost: N/A\")\n", + "\n", + "if result.success and result.data:\n", + " print(\"\\n--- Comments Data ---\")\n", + " data = result.data\n", + " if isinstance(data, list):\n", + " print(f\"Number of comments: {len(data)}\")\n", + " if 
len(data) > 0:\n", + " print(f\"Available keys: {list(data[0].keys())}\")\n", + " for i, comment in enumerate(data[:5]):\n", + " print(f\"\\nComment {i+1}:\")\n", + " # Correct API field names for comments endpoint\n", + " print(f\" User: {comment.get('comment_user', 'N/A')}\")\n", + " print(f\" User URL: {comment.get('comment_user_url', 'N/A')}\")\n", + " text = str(comment.get('comment', 'N/A'))\n", + " print(f\" Text: {text[:80]}...\")\n", + " print(f\" Likes: {comment.get('likes_number', 'N/A')}\")\n", + " print(f\" Replies: {comment.get('replies_number', 'N/A')}\")\n", + " print(f\" Date: {comment.get('comment_date', 'N/A')}\")\n", + " elif isinstance(data, dict):\n", + " print(f\"Available keys: {list(data.keys())}\")\n", + " print(f\"Data: {data}\")\n", + " else:\n", + " print(f\"Data type: {type(data)}\")\n", + "else:\n", + " print(f\"\\nNo data returned. Debug info:\")\n", + " print(f\" result.data: {result.data}\")\n", + " print(f\" result.row_count: {result.row_count}\")\n", + " print(f\" result.error: {result.error}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Part 2: InstagramSearchScraper (Discovery with extra_params)\n", + "\n", + "Test parameter-based discovery methods that use `extra_params` for:\n", + "- `type=discover_new`\n", + "- `discover_by=user_name|url|url_all_reels`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.1 Profiles Discovery - by username" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test profile discovery by username\n", + "# Uses: extra_params={\"type\": \"discover_new\", \"discover_by\": \"user_name\"}\n", + "USERNAME = \"nasa\"\n", + "\n", + "print(f\"Discovering profile for username: {USERNAME}\")\n", + "print(\"Using extra_params: type=discover_new, discover_by=user_name\")\n", + "print(\"This may take 1-3 minutes...\\n\")\n", + "\n", + "async with client.search.instagram.engine:\n", + " result = await client.search.instagram.profiles(user_name=USERNAME, timeout=180)\n", + "\n", + "print(f\"Success: {result.success}\")\n", + "print(f\"Status: {result.status}\")\n", + "print(f\"Snapshot ID: {result.snapshot_id}\")\n", + "print(f\"Cost: ${result.cost:.4f}\" if result.cost else \"Cost: N/A\")\n", + "\n", + "if result.success and result.data:\n", + " print(\"\\n--- Discovered Profile ---\")\n", + " data = result.data\n", + " # Show data structure first\n", + " if isinstance(data, list) and len(data) > 0:\n", + " data = data[0]\n", + " \n", + " print(f\"Available keys: {list(data.keys()) if isinstance(data, dict) else 'N/A'}\")\n", + " \n", + " # Correct API field names for profile discovery\n", + " print(f\"\\nAccount: {data.get('account', 'N/A')}\")\n", + " print(f\"Full Name: {data.get('full_name', 'N/A')}\")\n", + " print(f\"Profile Name: {data.get('profile_name', 'N/A')}\")\n", + " print(f\"ID: {data.get('id', 'N/A')}\")\n", + " print(f\"Followers: {data.get('followers', 'N/A'):,}\" if isinstance(data.get('followers'), int) else f\"Followers: {data.get('followers', 'N/A')}\")\n", + " print(f\"Following: {data.get('following', 'N/A')}\")\n", + " print(f\"Posts: {data.get('posts_count', 'N/A')}\")\n", + " print(f\"Is Verified: {data.get('is_verified', 'N/A')}\")\n", + " print(f\"Is Business: {data.get('is_business_account', 'N/A')}\")\n", + " print(f\"Category: {data.get('category_name', 'N/A')}\")\n", + " bio = str(data.get('biography', 'N/A') or 'N/A')\n", + " print(f\"Bio: {bio[:100]}...\")\n", + " 
print(f\"Profile URL: {data.get('profile_url', 'N/A')}\")\n", + " print(f\"External URL: {data.get('external_url', 'N/A')}\")\n", + "else:\n", + " print(f\"\\nError: {result.error}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.2 Posts Discovery - by profile URL with filters" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Discovering posts from profile: https://www.instagram.com/nasa/\n", + "Using extra_params: type=discover_new, discover_by=url\n", + "Filters: num_of_posts=5\n", + "This may take 1-3 minutes...\n", + "\n", + "Success: True\n", + "Status: ready\n", + "Snapshot ID: sd_mkfb7xwn29nxxarxe\n", + "Cost: $0.0100\n", + "\n", + "--- Discovered Posts ---\n", + "Number of posts discovered: 5\n", + "Available keys: ['url', 'user_posted', 'description', 'hashtags', 'num_comments', 'date_posted', 'likes', 'photos', 'latest_comments', 'post_id', 'discovery_input', 'shortcode', 'content_type', 'pk', 'content_id', 'thumbnail', 'followers', 'posts_count', 'profile_image_link', 'is_verified', 'is_paid_partnership', 'partnership_details', 'user_posted_id', 'post_content', 'audio', 'profile_url', 'videos_duration', 'images', 'alt_text', 'photos_number', 'timestamp', 'input']\n", + "\n", + "Post 1:\n", + " URL: https://www.instagram.com/p/DTTInqQliCG\n", + " Post ID: 3806424019865313414\n", + " Description: Soaring through stormy Sagittarius seas\n", + "\n", + "The Lagoon Nebula, ...\n", + " Likes: 398349\n", + " Comments: 1888\n", + "\n", + "Post 2:\n", + " URL: https://www.instagram.com/reel/DTQXTbvjp4y\n", + " Post ID: 3805644175656787506\n", + " Description: Catch Jupiter looking like the year aheadβ€”big and bright 😎\n", + " ...\n", + " Likes: 200641\n", + " Comments: 819\n", + "\n", + "Post 3:\n", + " URL: https://www.instagram.com/p/DTazwH-E60i\n", + " Post ID: 3808584042359794978\n", + " Description: Yeah, the view is gnarly πŸ”­β£\n", + "\n", + "This showstopping spiral galaxy...\n", + " Likes: 423621\n", + " Comments: 1119\n" + ] + } + ], + "source": [ + "# Test posts discovery from profile URL\n", + "# Uses: extra_params={\"type\": \"discover_new\", \"discover_by\": \"url\"}\n", + "PROFILE_URL = \"https://www.instagram.com/nasa/\"\n", + "\n", + "print(f\"Discovering posts from profile: {PROFILE_URL}\")\n", + "print(\"Using extra_params: type=discover_new, discover_by=url\")\n", + "print(\"Filters: num_of_posts=5\")\n", + "print(\"This may take 1-3 minutes...\\n\")\n", + "\n", + "async with client.search.instagram.engine:\n", + " result = await client.search.instagram.posts(\n", + " url=PROFILE_URL,\n", + " num_of_posts=5,\n", + " timeout=180\n", + " )\n", + "\n", + "print(f\"Success: {result.success}\")\n", + "print(f\"Status: {result.status}\")\n", + "print(f\"Snapshot ID: {result.snapshot_id}\")\n", + "print(f\"Cost: ${result.cost:.4f}\" if result.cost else \"Cost: N/A\")\n", + "\n", + "if result.success and result.data:\n", + " print(\"\\n--- Discovered Posts ---\")\n", + " data = result.data\n", + " if isinstance(data, list):\n", + " print(f\"Number of posts discovered: {len(data)}\")\n", + " if len(data) > 0:\n", + " print(f\"Available keys: {list(data[0].keys()) if isinstance(data[0], dict) else 'N/A'}\")\n", + " for i, post in enumerate(data[:3]):\n", + " print(f\"\\nPost {i+1}:\")\n", + " print(f\" URL: {post.get('url', 'N/A')}\")\n", + " print(f\" Post ID: {post.get('post_id', 'N/A')}\")\n", + " description = 
str(post.get('description', 'N/A') or 'N/A')\n", + " print(f\" Description: {description[:60]}...\")\n", + " print(f\" Likes: {post.get('likes', 'N/A')}\")\n", + " print(f\" Comments: {post.get('num_comments', 'N/A')}\")\n", + " elif isinstance(data, dict):\n", + " print(f\"Available keys: {list(data.keys())}\")\n", + " print(f\"Post ID: {data.get('post_id', 'N/A')}\")\n", + " description = str(data.get('description', 'N/A') or 'N/A')\n", + " print(f\"Description: {description[:60]}...\")\n", + " else:\n", + " print(f\"Data type: {type(data)}\")\n", + "else:\n", + " print(f\"\\nError: {result.error}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.3 Reels Discovery - by profile URL" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Discovering reels from profile: https://www.instagram.com/espn/\n", + "Using extra_params: type=discover_new, discover_by=url\n", + "Filters: num_of_posts=5\n", + "This may take 1-3 minutes...\n", + "\n", + "Success: True\n", + "Status: ready\n", + "Snapshot ID: sd_mkfb8dy72d6vsz89c6\n", + "Cost: $0.0100\n", + "\n", + "--- Discovered Reels ---\n", + "Number of reels discovered: 5\n", + "Available keys: ['url', 'user_posted', 'description', 'hashtags', 'num_comments', 'date_posted', 'likes', 'views', 'video_play_count', 'top_comments', 'post_id', 'thumbnail', 'shortcode', 'content_id', 'product_type', 'coauthor_producers', 'tagged_users', 'length', 'video_url', 'audio_url', 'posts_count', 'followers', 'following', 'user_profile_url', 'is_paid_partnership', 'is_verified', 'profile_image_link', 'timestamp', 'input', 'discovery_input']\n", + "\n", + "Reel 1:\n", + " URL: https://www.instagram.com/p/DStw-S5Dgmt/\n", + " Post ID: 3795905448067860909_505182045\n", + " User: sportscenter\n", + " Likes: 305570\n", + " Comments: 1878\n", + "\n", + "Reel 2:\n", + " URL: https://www.instagram.com/p/DTgR7XfDOLy/\n", + " Post ID: 3810124131212255986_1320207\n", + " User: espn\n", + " Likes: 9097\n", + " Comments: 42\n", + "\n", + "Reel 3:\n", + " URL: https://www.instagram.com/p/DThJaRvAKgV/\n", + " Post ID: 3810368147899328533_1320207\n", + " User: espn\n", + " Likes: 21291\n", + " Comments: 131\n" + ] + } + ], + "source": [ + "# Test reels discovery from profile URL\n", + "# Uses: extra_params={\"type\": \"discover_new\", \"discover_by\": \"url\"}\n", + "PROFILE_URL = \"https://www.instagram.com/espn/\"\n", + "\n", + "print(f\"Discovering reels from profile: {PROFILE_URL}\")\n", + "print(\"Using extra_params: type=discover_new, discover_by=url\")\n", + "print(\"Filters: num_of_posts=5\")\n", + "print(\"This may take 1-3 minutes...\\n\")\n", + "\n", + "async with client.search.instagram.engine:\n", + " result = await client.search.instagram.reels(\n", + " url=PROFILE_URL,\n", + " num_of_posts=5,\n", + " timeout=180\n", + " )\n", + "\n", + "print(f\"Success: {result.success}\")\n", + "print(f\"Status: {result.status}\")\n", + "print(f\"Snapshot ID: {result.snapshot_id}\")\n", + "print(f\"Cost: ${result.cost:.4f}\" if result.cost else \"Cost: N/A\")\n", + "\n", + "if result.success and result.data:\n", + " print(\"\\n--- Discovered Reels ---\")\n", + " data = result.data\n", + " if isinstance(data, list):\n", + " print(f\"Number of reels discovered: {len(data)}\")\n", + " if len(data) > 0:\n", + " print(f\"Available keys: {list(data[0].keys()) if isinstance(data[0], dict) else 'N/A'}\")\n", + " for i, reel in enumerate(data[:3]):\n", 
+ " print(f\"\\nReel {i+1}:\")\n", + " print(f\" URL: {reel.get('url', 'N/A')}\")\n", + " print(f\" Post ID: {reel.get('post_id', 'N/A')}\")\n", + " print(f\" User: {reel.get('user_posted', 'N/A')}\")\n", + " print(f\" Likes: {reel.get('likes', 'N/A')}\")\n", + " print(f\" Comments: {reel.get('num_comments', 'N/A')}\")\n", + " elif isinstance(data, dict):\n", + " print(f\"Available keys: {list(data.keys())}\")\n", + " else:\n", + " print(f\"Data type: {type(data)}\")\n", + "else:\n", + " print(f\"\\nError: {result.error}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.4 Reels All Discovery - by profile URL" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Discovering ALL reels from profile: https://www.instagram.com/billieeilish/\n", + "Using extra_params: type=discover_new, discover_by=url_all_reels\n", + "Filters: num_of_posts=10\n", + "This may take 1-3 minutes...\n", + "\n", + "Success: False\n", + "Status: timeout\n", + "Snapshot ID: sd_mkfbpfvl12xas6bvz2\n", + "Cost: N/A\n", + "\n", + "Error: Polling timeout after 180s\n" + ] + } + ], + "source": [ + "# Test reels_all discovery from profile URL\n", + "# Uses: extra_params={\"type\": \"discover_new\", \"discover_by\": \"url_all_reels\"}\n", + "PROFILE_URL = \"https://www.instagram.com/billieeilish/\"\n", + "\n", + "print(f\"Discovering ALL reels from profile: {PROFILE_URL}\")\n", + "print(\"Using extra_params: type=discover_new, discover_by=url_all_reels\")\n", + "print(\"Filters: num_of_posts=10\")\n", + "print(\"This may take 1-3 minutes...\\n\")\n", + "\n", + "async with client.search.instagram.engine:\n", + " result = await client.search.instagram.reels_all(\n", + " url=PROFILE_URL,\n", + " num_of_posts=10,\n", + " timeout=180\n", + " )\n", + "\n", + "print(f\"Success: {result.success}\")\n", + "print(f\"Status: {result.status}\")\n", + "print(f\"Snapshot ID: {result.snapshot_id}\")\n", + "print(f\"Cost: ${result.cost:.4f}\" if result.cost else \"Cost: N/A\")\n", + "\n", + "if result.success and result.data:\n", + " print(\"\\n--- Discovered All Reels ---\")\n", + " data = result.data\n", + " if isinstance(data, list):\n", + " print(f\"Number of reels discovered: {len(data)}\")\n", + " else:\n", + " print(f\"Data type: {type(data)}\")\n", + "else:\n", + " print(f\"\\nError: {result.error}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Part 3: Verify Timing Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check timing metadata from last result\n", + "print(\"=== Timing Metadata ===\")\n", + "print(f\"trigger_sent_at: {result.trigger_sent_at}\")\n", + "print(f\"snapshot_id_received_at: {result.snapshot_id_received_at}\")\n", + "print(f\"snapshot_polled_at: {result.snapshot_polled_at}\")\n", + "print(f\"data_fetched_at: {result.data_fetched_at}\")\n", + "print(f\"\\nrow_count: {result.row_count}\")\n", + "print(f\"cost: {result.cost}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Summary\n", + "\n", + "## InstagramScraper (URL-based)\n", + "- `profiles(url)` - Extract profile by URL\n", + "- `posts(url)` - Extract post by URL\n", + "- `reels(url)` - Extract reel by URL\n", + "- `comments(url)` - Extract comments by post URL\n", + "\n", + "## InstagramSearchScraper (Discovery with extra_params)\n", + "- `profiles(user_name)` 
- Discover by username (`discover_by=user_name`)\n", + "- `posts(url, ...)` - Discover posts with filters (`discover_by=url`)\n", + "- `reels(url, ...)` - Discover reels (`discover_by=url`)\n", + "- `reels_all(url, ...)` - Discover all reels (`discover_by=url_all_reels`)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/test_v2.1.0_release.ipynb b/notebooks/test_v2.1.0_release.ipynb new file mode 100644 index 0000000..563179f --- /dev/null +++ b/notebooks/test_v2.1.0_release.ipynb @@ -0,0 +1,497 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bright Data SDK v2.1.0 Release Test\n", + "\n", + "This notebook verifies that the v2.1.0 release is working correctly." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Install the Package" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: brightdata-sdk==2.1.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (2.1.0)\n", + "Requirement already satisfied: aiohttp>=3.9.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from brightdata-sdk==2.1.0) (3.13.2)\n", + "Requirement already satisfied: requests>=2.31.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from brightdata-sdk==2.1.0) (2.32.5)\n", + "Requirement already satisfied: python-dotenv>=1.0.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from brightdata-sdk==2.1.0) (1.2.1)\n", + "Requirement already satisfied: tldextract>=5.0.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from brightdata-sdk==2.1.0) (5.3.0)\n", + "Requirement already satisfied: pydantic>=2.0.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from brightdata-sdk==2.1.0) (2.12.5)\n", + "Requirement already satisfied: pydantic-settings>=2.0.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from brightdata-sdk==2.1.0) (2.12.0)\n", + "Requirement already satisfied: aiolimiter>=1.1.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from brightdata-sdk==2.1.0) (1.2.1)\n", + "Requirement already satisfied: click>=8.1.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from brightdata-sdk==2.1.0) (8.3.1)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from aiohttp>=3.9.0->brightdata-sdk==2.1.0) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from aiohttp>=3.9.0->brightdata-sdk==2.1.0) (1.4.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from aiohttp>=3.9.0->brightdata-sdk==2.1.0) (25.4.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from aiohttp>=3.9.0->brightdata-sdk==2.1.0) (1.8.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages 
(from aiohttp>=3.9.0->brightdata-sdk==2.1.0) (6.7.0)\n", + "Requirement already satisfied: propcache>=0.2.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from aiohttp>=3.9.0->brightdata-sdk==2.1.0) (0.4.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from aiohttp>=3.9.0->brightdata-sdk==2.1.0) (1.22.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from pydantic>=2.0.0->brightdata-sdk==2.1.0) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.41.5 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from pydantic>=2.0.0->brightdata-sdk==2.1.0) (2.41.5)\n", + "Requirement already satisfied: typing-extensions>=4.14.1 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from pydantic>=2.0.0->brightdata-sdk==2.1.0) (4.15.0)\n", + "Requirement already satisfied: typing-inspection>=0.4.2 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from pydantic>=2.0.0->brightdata-sdk==2.1.0) (0.4.2)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from requests>=2.31.0->brightdata-sdk==2.1.0) (3.4.4)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from requests>=2.31.0->brightdata-sdk==2.1.0) (3.11)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from requests>=2.31.0->brightdata-sdk==2.1.0) (2.5.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from requests>=2.31.0->brightdata-sdk==2.1.0) (2025.11.12)\n", + "Requirement already satisfied: requests-file>=1.4 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from tldextract>=5.0.0->brightdata-sdk==2.1.0) (3.0.1)\n", + "Requirement already satisfied: filelock>=3.0.8 in /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages (from tldextract>=5.0.0->brightdata-sdk==2.1.0) (3.20.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.3\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install brightdata-sdk==2.1.0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Verify Version" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installed version: 2.1.0\n", + "Version check passed!\n" + ] + } + ], + "source": [ + "import brightdata\n", + "print(f\"Installed version: {brightdata.__version__}\")\n", + "assert brightdata.__version__ == \"2.1.0\", f\"Expected 2.1.0, got {brightdata.__version__}\"\n", + "print(\"Version check passed!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Verify Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All imports successful!\n" + ] + } + ], + "source": [ + "from brightdata import BrightDataClient, SyncBrightDataClient\n", + "from brightdata.api.async_unblocker import AsyncUnblockerClient\n", + "from brightdata.models import ScrapeResult, SearchResult\n", + "print(\"All imports successful!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Verify Client Structure" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BrightDataClient structure verified!\n", + "SyncBrightDataClient available: True\n" + ] + } + ], + "source": [ + "# Check BrightDataClient has expected attributes\n", + "client_attrs = ['scrape', 'search', 'crawler', 'scrape_url']\n", + "for attr in client_attrs:\n", + " assert hasattr(BrightDataClient, attr) or attr in dir(BrightDataClient), f\"Missing: {attr}\"\n", + "print(\"BrightDataClient structure verified!\")\n", + "\n", + "# Check SyncBrightDataClient exists\n", + "print(f\"SyncBrightDataClient available: {SyncBrightDataClient is not None}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Test Async Mode Parameter (Structure Only)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WebUnlockerService.scrape parameters: ['self', 'url', 'zone', 'country', 'response_format', 'method', 'timeout', 'mode', 'poll_interval', 'poll_timeout']\n", + "WebUnlocker async mode parameter verified!\n" + ] + } + ], + "source": [ + "import inspect\n", + "from brightdata.api.web_unlocker import WebUnlockerService\n", + "\n", + "# Check WebUnlocker has mode parameter\n", + "sig = inspect.signature(WebUnlockerService.scrape)\n", + "params = list(sig.parameters.keys())\n", + "print(f\"WebUnlockerService.scrape parameters: {params}\")\n", + "assert 'mode' in params, \"Missing 'mode' parameter in WebUnlockerService.scrape\"\n", + "print(\"WebUnlocker async mode parameter verified!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Test AsyncUnblockerClient Structure" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AsyncUnblockerClient methods verified: ['trigger', 'get_status', 'fetch_result']\n" + ] + } + ], + "source": [ + "# Verify AsyncUnblockerClient has the expected methods\n", + "expected_methods = ['trigger', 'get_status', 'fetch_result']\n", + "for method in expected_methods:\n", + " assert hasattr(AsyncUnblockerClient, method), f\"Missing method: {method}\"\n", + "print(f\"AsyncUnblockerClient methods verified: {expected_methods}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. 
Live Test (Requires API Token)\n", + "\n", + "Set your API token to run live tests:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Token found: 7011787d-2...d3336\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "# Set your token here or via environment variable\n", + "# os.environ[\"BRIGHTDATA_API_TOKEN\"] = \"your_token_here\"\n", + "\n", + "TOKEN = os.environ.get(\"BRIGHTDATA_API_TOKEN\")\n", + "if TOKEN:\n", + " print(f\"Token found: {TOKEN[:10]}...{TOKEN[-5:]}\")\n", + "else:\n", + " print(\"No token found. Set BRIGHTDATA_API_TOKEN to run live tests.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7.1 Test SERP Sync Mode" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SERP sync mode - Results: 7 items\n", + "First result: Welcome to Python.org\n" + ] + }, + { + "data": { + "text/plain": [ + " query={'q': 'python programming', 'location': None, 'lan...>" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import asyncio\n", + "\n", + "async def test_serp_sync():\n", + " if not TOKEN:\n", + " print(\"Skipping: No token\")\n", + " return\n", + " \n", + " async with BrightDataClient(token=TOKEN) as client:\n", + " result = await client.search.google(query=\"python programming\", num_results=5)\n", + " print(f\"SERP sync mode - Results: {len(result.data)} items\")\n", + " print(f\"First result: {result.data[0].get('title', 'N/A') if result.data else 'No data'}\")\n", + " return result\n", + "\n", + "await test_serp_sync()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7.2 Test SERP Async Mode" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing SERP async mode...\n", + "Result type: \n", + "Result attributes: ['cost', 'country', 'data', 'data_fetched_at', 'elapsed_ms', 'error', 'get_timing_breakdown', 'page', 'query', 'results_per_page', 'save_to_file', 'search_engine', 'success', 'to_dict', 'to_json', 'total_found', 'trigger_sent_at']\n", + "Result dict: {'success': False, 'cost': None, 'error': 'Polling timeout after 200s (response_id: s4w10t1767802054612rt2phhuenppg)', 'trigger_sent_at': datetime.datetime(2026, 1, 7, 16, 7, 34, 169001, tzinfo=datetime.timezone.utc), 'data_fetched_at': datetime.datetime(2026, 1, 7, 16, 10, 55, 588762, tzinfo=datetime.timezone.utc), 'query': {'q': 'python programming'}, 'data': None, 'total_found': None, 'search_engine': 'google', 'country': None, 'page': None, 'results_per_page': None}\n", + "SERP async mode - No data returned\n", + "Error: Polling timeout after 200s (response_id: s4w10t1767802054612rt2phhuenppg)\n" + ] + }, + { + "data": { + "text/plain": [ + " query={'q': 'python programming'}>" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "async def test_serp_async():\n", + " if not TOKEN:\n", + " print(\"Skipping: No token\")\n", + " return\n", + " \n", + " async with BrightDataClient(token=TOKEN) as client:\n", + " print(\"Testing SERP async mode...\")\n", + " result = await client.search.google(\n", + " query=\"python programming\",\n", + " mode=\"async\",\n", + " poll_interval=2,\n", + " poll_timeout=200\n", + " 
)\n", + " print(f\"Result type: {type(result)}\")\n", + " print(f\"Result attributes: {[a for a in dir(result) if not a.startswith('_')]}\")\n", + " print(f\"Result dict: {result.__dict__ if hasattr(result, '__dict__') else 'N/A'}\")\n", + " \n", + " if result and result.data:\n", + " print(f\"SERP async mode - Results: {len(result.data)} items\")\n", + " else:\n", + " print(f\"SERP async mode - No data returned\")\n", + " if hasattr(result, 'error'):\n", + " print(f\"Error: {result.error}\")\n", + " return result\n", + "\n", + "await test_serp_async()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7.3 Test Web Unlocker Sync Mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "async def test_web_unlocker_sync():\n", + " if not TOKEN:\n", + " print(\"Skipping: No token\")\n", + " return\n", + " \n", + " async with BrightDataClient(token=TOKEN) as client:\n", + " result = await client.scrape_url(url=\"https://example.com\")\n", + " print(f\"Web Unlocker sync mode - Status: {result.status}\")\n", + " print(f\"Data length: {len(result.data) if result.data else 0} chars\")\n", + " return result\n", + "\n", + "await test_web_unlocker_sync()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7.4 Test Web Unlocker Async Mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "async def test_web_unlocker_async():\n", + " if not TOKEN:\n", + " print(\"Skipping: No token\")\n", + " return\n", + " \n", + " async with BrightDataClient(token=TOKEN) as client:\n", + " print(\"Testing Web Unlocker async mode (may take ~2 minutes)...\")\n", + " result = await client.scrape_url(\n", + " url=\"https://example.com\",\n", + " mode=\"async\",\n", + " poll_interval=5,\n", + " poll_timeout=180\n", + " )\n", + " print(f\"Web Unlocker async mode - Status: {result.status}\")\n", + " print(f\"Data length: {len(result.data) if result.data else 0} chars\")\n", + " return result\n", + "\n", + "await test_web_unlocker_async()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7.5 Test SyncBrightDataClient" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def test_sync_client():\n", + " if not TOKEN:\n", + " print(\"Skipping: No token\")\n", + " return\n", + " \n", + " with SyncBrightDataClient(token=TOKEN) as client:\n", + " result = client.scrape_url(url=\"https://example.com\")\n", + " print(f\"SyncBrightDataClient - Status: {result.status}\")\n", + " print(f\"Data length: {len(result.data) if result.data else 0} chars\")\n", + " return result\n", + "\n", + "test_sync_client()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. 
Summary" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"=\"*50)\n", + "print(\"Bright Data SDK v2.1.0 Release Test Summary\")\n", + "print(\"=\"*50)\n", + "print(f\"Version: {brightdata.__version__}\")\n", + "print(\"Imports: OK\")\n", + "print(\"Client structure: OK\")\n", + "print(\"Async mode parameter: OK\")\n", + "print(\"AsyncUnblockerClient: OK\")\n", + "if TOKEN:\n", + " print(\"Live tests: Completed (check results above)\")\n", + "else:\n", + " print(\"Live tests: Skipped (no token)\")\n", + "print(\"=\"*50)\n", + "print(\"Release verification complete!\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pyproject.toml b/pyproject.toml index 023e432..da41a0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ where = ["src"] [project] name = "brightdata-sdk" -version = "2.1.0" +version = "2.1.1" description = "Modern async-first Python SDK for Bright Data APIs" authors = [{name = "Bright Data", email = "support@brightdata.com"}] license = {text = "MIT"} diff --git a/src/brightdata/__init__.py b/src/brightdata/__init__.py index ecc02f8..244593b 100644 --- a/src/brightdata/__init__.py +++ b/src/brightdata/__init__.py @@ -1,6 +1,12 @@ """Bright Data Python SDK - Modern async-first SDK for Bright Data APIs.""" -__version__ = "2.1.0" +from importlib.metadata import version, PackageNotFoundError + +try: + __version__ = version("brightdata-sdk") +except PackageNotFoundError: + # Package not installed (development mode without pip install -e) + __version__ = "0.0.0.dev" # Export main client (async) from .client import BrightDataClient diff --git a/src/brightdata/_version.py b/src/brightdata/_version.py deleted file mode 100644 index b627d86..0000000 --- a/src/brightdata/_version.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Version information.""" - -__version__ = "2.1.0" diff --git a/src/brightdata/cli/main.py b/src/brightdata/cli/main.py index 5a97b3f..aec213a 100644 --- a/src/brightdata/cli/main.py +++ b/src/brightdata/cli/main.py @@ -7,13 +7,14 @@ import click import sys +from brightdata import __version__ from .commands import scrape_group, search_group from .banner import print_banner from .utils import handle_error @click.group(invoke_without_command=True) -@click.version_option(version="2.1.0", prog_name="brightdata") +@click.version_option(version=__version__, prog_name="brightdata") @click.option("--banner/--no-banner", default=True, help="Show/hide banner on startup") @click.pass_context def cli(ctx: click.Context, banner: bool) -> None: diff --git a/src/brightdata/core/engine.py b/src/brightdata/core/engine.py index 0887831..e9b0858 100644 --- a/src/brightdata/core/engine.py +++ b/src/brightdata/core/engine.py @@ -5,6 +5,7 @@ import ssl import warnings from typing import Optional, Dict, Any +from .. 
import __version__ from ..exceptions import AuthenticationError, NetworkError, TimeoutError, SSLError from ..constants import HTTP_UNAUTHORIZED, HTTP_FORBIDDEN from ..utils.ssl_helpers import is_ssl_certificate_error, get_ssl_error_message @@ -92,7 +93,7 @@ async def __aenter__(self): headers={ "Authorization": f"Bearer {self.bearer_token}", "Content-Type": "application/json", - "User-Agent": "brightdata-sdk/2.1.0", + "User-Agent": f"brightdata-sdk/{__version__}", }, ) diff --git a/src/brightdata/models.py b/src/brightdata/models.py index 2fd1233..e184751 100644 --- a/src/brightdata/models.py +++ b/src/brightdata/models.py @@ -10,7 +10,7 @@ StatusType = Literal["ready", "error", "timeout", "in_progress"] -PlatformType = Optional[Literal["linkedin", "amazon", "chatgpt"]] +PlatformType = Optional[Literal["linkedin", "amazon", "chatgpt", "instagram", "facebook"]] SearchEngineType = Optional[Literal["google", "bing", "yandex"]] diff --git a/src/brightdata/scrapers/api_client.py b/src/brightdata/scrapers/api_client.py index fbd2b3b..9c65b9c 100644 --- a/src/brightdata/scrapers/api_client.py +++ b/src/brightdata/scrapers/api_client.py @@ -45,6 +45,7 @@ async def trigger( dataset_id: str, include_errors: bool = True, sdk_function: Optional[str] = None, + extra_params: Optional[Dict[str, str]] = None, ) -> Optional[str]: """ Trigger dataset collection and get snapshot_id. @@ -54,6 +55,8 @@ async def trigger( dataset_id: Bright Data dataset identifier include_errors: Include error records in results sdk_function: SDK function name for monitoring + extra_params: Additional query parameters (e.g., for discovery endpoints: + {"type": "discover_new", "discover_by": "user_name"}) Returns: snapshot_id if successful, None otherwise @@ -69,6 +72,9 @@ async def trigger( if sdk_function: params["sdk_function"] = sdk_function + if extra_params: + params.update(extra_params) + async with self.engine.post_to_url( self.TRIGGER_URL, json_data=payload, params=params ) as response: diff --git a/src/brightdata/scrapers/base.py b/src/brightdata/scrapers/base.py index ece8f74..dffc5f0 100644 --- a/src/brightdata/scrapers/base.py +++ b/src/brightdata/scrapers/base.py @@ -246,7 +246,11 @@ def _build_scrape_payload(self, urls: List[str], **kwargs) -> List[Dict[str, Any # ============================================================================ async def _trigger_scrape_async( - self, urls: Union[str, List[str]], sdk_function: Optional[str] = None, **kwargs + self, + urls: Union[str, List[str]], + dataset_id: Optional[str] = None, + sdk_function: Optional[str] = None, + **kwargs, ) -> ScrapeJob: """ Trigger scrape job (internal async method). 
@@ -257,6 +261,7 @@ async def _trigger_scrape_async( Args: urls: URL or list of URLs to scrape + dataset_id: Optional dataset ID (defaults to self.DATASET_ID if not provided) sdk_function: SDK function name for monitoring **kwargs: Additional platform-specific parameters @@ -278,10 +283,10 @@ async def _trigger_scrape_async( # Build payload payload = self._build_scrape_payload(url_list, **kwargs) - # Trigger via API + # Trigger via API (use provided dataset_id or fall back to class default) snapshot_id = await self.api_client.trigger( payload=payload, - dataset_id=self.DATASET_ID, + dataset_id=dataset_id or self.DATASET_ID, include_errors=True, sdk_function=sdk_function, ) @@ -298,10 +303,18 @@ async def _trigger_scrape_async( ) def _trigger_scrape( - self, urls: Union[str, List[str]], sdk_function: Optional[str] = None, **kwargs + self, + urls: Union[str, List[str]], + dataset_id: Optional[str] = None, + sdk_function: Optional[str] = None, + **kwargs, ) -> ScrapeJob: """Trigger scrape job (internal sync wrapper).""" - return _run_blocking(self._trigger_scrape_async(urls, sdk_function=sdk_function, **kwargs)) + return _run_blocking( + self._trigger_scrape_async( + urls, dataset_id=dataset_id, sdk_function=sdk_function, **kwargs + ) + ) async def _check_status_async(self, snapshot_id: str) -> str: """ diff --git a/src/brightdata/scrapers/instagram/__init__.py b/src/brightdata/scrapers/instagram/__init__.py index a9a51ee..847954f 100644 --- a/src/brightdata/scrapers/instagram/__init__.py +++ b/src/brightdata/scrapers/instagram/__init__.py @@ -1,4 +1,4 @@ -"""Instagram scraper for profiles, posts, comments, and reels.""" +"""Instagram scrapers for URL-based and parameter-based extraction.""" from .scraper import InstagramScraper from .search import InstagramSearchScraper diff --git a/src/brightdata/scrapers/instagram/scraper.py b/src/brightdata/scrapers/instagram/scraper.py index c27374c..b7c45df 100644 --- a/src/brightdata/scrapers/instagram/scraper.py +++ b/src/brightdata/scrapers/instagram/scraper.py @@ -1,119 +1,167 @@ """ -Instagram Scraper - URL-based extraction for profiles, posts, comments, and reels. +Instagram URL-based scraper for extracting data from Instagram URLs. -This module contains the InstagramScraper class which provides URL-based extraction -for Instagram profiles, posts, comments, and reels. All methods use the standard -async workflow (trigger/poll/fetch). - -API Specifications: -- client.scrape.instagram.profiles(url, timeout=240) -- client.scrape.instagram.posts(url, timeout=240) -- client.scrape.instagram.comments(url, timeout=240) -- client.scrape.instagram.reels(url, timeout=240) - -All methods accept: -- url: str | list (required) - Single URL or list of URLs -- timeout: int (default: 240) - Maximum wait time in seconds for polling - -For discovery/search operations, see search.py which contains InstagramSearchScraper. 
+Supports: +- Profile extraction from profile URLs +- Post extraction from post URLs +- Reel extraction from reel URLs +- Comment extraction from post/reel URLs """ import asyncio -from typing import Union, List, Optional, Any +from typing import List, Any, Union from ..base import BaseWebScraper from ..registry import register from ..job import ScrapeJob from ...models import ScrapeResult +from ...constants import ( + COST_PER_RECORD_INSTAGRAM, + DEFAULT_TIMEOUT_SHORT, + DEFAULT_POLL_INTERVAL, +) from ...utils.validation import validate_url, validate_url_list from ...utils.function_detection import get_caller_function_name -from ...constants import DEFAULT_POLL_INTERVAL, DEFAULT_TIMEOUT_MEDIUM, COST_PER_RECORD_INSTAGRAM @register("instagram") class InstagramScraper(BaseWebScraper): """ - Instagram scraper for URL-based extraction. + Instagram scraper for URL-based data extraction. - Extracts structured data from Instagram URLs for: - - Profiles (by profile URL) - - Posts (by post URL) - - Comments (by post URL) - - Reels (by reel URL) + Extracts structured data from Instagram URLs including profiles, + posts, reels, and comments. Example: - >>> scraper = InstagramScraper(bearer_token="token") - >>> - >>> # Async usage - >>> result = await scraper.profiles( - ... url="https://instagram.com/username", - ... timeout=240 - ... ) - >>> - >>> # Sync usage - >>> result = scraper.profiles_sync( - ... url="https://instagram.com/username", - ... timeout=240 - ... ) + >>> async with InstagramScraper(bearer_token="...") as scraper: + ... result = await scraper.profiles("https://instagram.com/nasa") + ... print(result.data) """ - # Instagram dataset IDs - DATASET_ID = "gd_l1vikfch901nx3by4" # Default: Profiles - DATASET_ID_PROFILES = "gd_l1vikfch901nx3by4" # Profiles by URL - DATASET_ID_POSTS = "gd_lk5ns7kz21pck8jpis" # Posts by URL - DATASET_ID_COMMENTS = "gd_ltppn085pokosxh13" # Comments by Post URL - DATASET_ID_REELS = "gd_lyclm20il4r5helnj" # Reels by URL + # Dataset IDs for different content types + DATASET_ID = "gd_l1vikfch901nx3by4" # Profiles (default) + DATASET_ID_POSTS = "gd_lk5ns7kz21pck8jpis" + DATASET_ID_REELS = "gd_lyclm20il4r5helnj" + DATASET_ID_COMMENTS = "gd_ltppn085pokosxh13" + # Platform configuration PLATFORM_NAME = "instagram" - MIN_POLL_TIMEOUT = DEFAULT_TIMEOUT_MEDIUM - COST_PER_RECORD = COST_PER_RECORD_INSTAGRAM + MIN_POLL_TIMEOUT = DEFAULT_TIMEOUT_SHORT # 180s + COST_PER_RECORD = COST_PER_RECORD_INSTAGRAM # 0.002 # ============================================================================ - # PROFILES API - By URL + # INTERNAL HELPERS # ============================================================================ - async def profiles( + async def _scrape_urls( self, url: Union[str, List[str]], - timeout: int = DEFAULT_TIMEOUT_MEDIUM, + dataset_id: str, + timeout: int, ) -> Union[ScrapeResult, List[ScrapeResult]]: """ - Collect profile details from Instagram profile URL (async). - - Collects comprehensive data about an Instagram profile including business - and engagement information, posts, and user details. + Internal method to scrape URLs with specified dataset. Args: - url: Instagram profile URL or list of URLs (required) - timeout: Maximum wait time in seconds for polling (default: 240) + url: Single URL or list of URLs to scrape + dataset_id: Bright Data dataset identifier for this content type + timeout: Maximum seconds to wait for results Returns: - ScrapeResult or List[ScrapeResult] with profile data - - Example: - >>> result = await scraper.profiles( - ... 
url="https://instagram.com/username", - ... timeout=240 - ... ) + ScrapeResult for single URL, List[ScrapeResult] for multiple URLs """ - if isinstance(url, str): + # Normalize input + is_single = isinstance(url, str) + url_list = [url] if is_single else url + + # Validate + if is_single: validate_url(url) else: - validate_url_list(url) + validate_url_list(url_list) + + # Build simple payload + payload = [{"url": u} for u in url_list] + + # Get SDK function name for tracking + sdk_function = get_caller_function_name() + + # Execute workflow + result = await self.workflow_executor.execute( + payload=payload, + dataset_id=dataset_id, + poll_interval=DEFAULT_POLL_INTERVAL, + poll_timeout=timeout, + include_errors=True, + sdk_function=sdk_function, + normalize_func=self.normalize_result, + ) + # Transform result based on input type + if is_single and isinstance(result.data, list) and len(result.data) == 1: + # Single URL: unwrap single item + result.url = url if isinstance(url, str) else url[0] + result.data = result.data[0] + return result + elif not is_single and isinstance(result.data, list): + # Multiple URLs: create individual ScrapeResult for each + results = [] + for url_item, data_item in zip(url_list, result.data): + individual_result = ScrapeResult( + success=True, + data=data_item, + url=url_item, + error=None, + platform=result.platform, + method=result.method, + trigger_sent_at=result.trigger_sent_at, + snapshot_id_received_at=result.snapshot_id_received_at, + snapshot_polled_at=result.snapshot_polled_at, + data_fetched_at=result.data_fetched_at, + snapshot_id=result.snapshot_id, + cost=result.cost / len(result.data) if result.cost else None, + ) + results.append(individual_result) + return results + + return result + + # ============================================================================ + # PROFILES (URL-based extraction) + # ============================================================================ + + async def profiles( + self, + url: Union[str, List[str]], + timeout: int = DEFAULT_TIMEOUT_SHORT, + ) -> Union[ScrapeResult, List[ScrapeResult]]: + """ + Extract profile data from Instagram profile URLs. + + Args: + url: Profile URL or list of profile URLs + Example: "https://www.instagram.com/nasa/" + timeout: Maximum seconds to wait (default: 180) + + Returns: + ScrapeResult for single URL, List[ScrapeResult] for multiple URLs + + Example: + >>> result = await scraper.profiles("https://instagram.com/nasa") + >>> print(result.data["followers"]) + """ return await self._scrape_urls( url=url, - dataset_id=self.DATASET_ID_PROFILES, + dataset_id=self.DATASET_ID, timeout=timeout, - sdk_function="profiles", ) def profiles_sync( self, url: Union[str, List[str]], - timeout: int = DEFAULT_TIMEOUT_MEDIUM, + timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, List[ScrapeResult]]: - """Collect profile details from Instagram profile URL (sync wrapper).""" + """Synchronous version of profiles(). See profiles() for documentation.""" async def _run(): async with self.engine: @@ -121,84 +169,120 @@ async def _run(): return asyncio.run(_run()) - # --- Trigger Interface (Manual Control) --- + async def profiles_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: + """ + Trigger profile extraction job without waiting for results. 
+ + Args: + url: Profile URL or list of profile URLs - async def profiles_trigger(self, url: Union[str, List[str]]) -> "ScrapeJob": - """Trigger Instagram profiles scrape (async - manual control).""" + Returns: + ScrapeJob for status checking and result fetching + Example: + >>> job = await scraper.profiles_trigger("https://instagram.com/nasa") + >>> status = await job.status() + >>> if status == "ready": + ... data = await job.fetch() + """ sdk_function = get_caller_function_name() return await self._trigger_scrape_async( urls=url, - dataset_id=self.DATASET_ID_PROFILES, - sdk_function=sdk_function or "profiles_trigger", + dataset_id=self.DATASET_ID, + sdk_function=sdk_function, ) - def profiles_trigger_sync(self, url: Union[str, List[str]]) -> "ScrapeJob": - """Trigger Instagram profiles scrape (sync wrapper).""" - return asyncio.run(self.profiles_trigger(url)) + def profiles_trigger_sync(self, url: Union[str, List[str]]) -> ScrapeJob: + """Synchronous version of profiles_trigger().""" + + async def _run(): + async with self.engine: + return await self.profiles_trigger(url) + + return asyncio.run(_run()) async def profiles_status(self, snapshot_id: str) -> str: - """Check Instagram profiles status (async).""" + """Check status of a profiles extraction job.""" return await self._check_status_async(snapshot_id) def profiles_status_sync(self, snapshot_id: str) -> str: - """Check Instagram profiles status (sync wrapper).""" - return asyncio.run(self.profiles_status(snapshot_id)) + """Synchronous version of profiles_status().""" + + async def _run(): + async with self.engine: + return await self.profiles_status(snapshot_id) + + return asyncio.run(_run()) async def profiles_fetch(self, snapshot_id: str) -> Any: - """Fetch Instagram profiles results (async).""" + """Fetch results of a completed profiles extraction job.""" return await self._fetch_results_async(snapshot_id) def profiles_fetch_sync(self, snapshot_id: str) -> Any: - """Fetch Instagram profiles results (sync wrapper).""" - return asyncio.run(self.profiles_fetch(snapshot_id)) + """Synchronous version of profiles_fetch().""" + + async def _run(): + async with self.engine: + return await self.profiles_fetch(snapshot_id) + + return asyncio.run(_run()) # ============================================================================ - # POSTS API - By URL + # POSTS (URL-based extraction) # ============================================================================ async def posts( self, url: Union[str, List[str]], - timeout: int = DEFAULT_TIMEOUT_MEDIUM, + timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, List[ScrapeResult]]: """ - Collect detailed data from Instagram post URLs (async). - - Collects comprehensive data from Instagram posts including post details, - page/profile details, and attachments/media. + Extract post data from Instagram post URLs. 
Args: - url: Instagram post URL or list of URLs (required) - timeout: Maximum wait time in seconds for polling (default: 240) + url: Post URL or list of post URLs + Example: "https://www.instagram.com/p/Cuf4s0MNqNr" + timeout: Maximum seconds to wait (default: 180) Returns: - ScrapeResult or List[ScrapeResult] with post data + ScrapeResult for single URL, List[ScrapeResult] for multiple URLs + + Response data fields: + - post_id (str): Unique post identifier + - shortcode (str): URL shortcode (e.g., "DTGAZJQkg5k") + - content_type (str): "Image", "Video", or "Carousel" + - description (str): Post caption text + - date_posted (str): ISO timestamp of posting + - likes (int): Number of likes + - num_comments (int): Number of comments + - user_posted (str): Username who posted + - user_posted_id (str): User's numeric ID + - profile_url (str): URL to user's profile + - followers (int): User's follower count + - is_verified (bool): Whether user is verified + - photos (list): List of photo URLs + - thumbnail (str): Thumbnail image URL + - post_content (list): Detailed content items with type, url, alt_text + - latest_comments (list): Recent comments with user, text, likes + - is_paid_partnership (bool): Whether post is sponsored Example: - >>> result = await scraper.posts( - ... url="https://instagram.com/p/ABC123", - ... timeout=240 - ... ) + >>> result = await scraper.posts("https://instagram.com/p/ABC123/") + >>> print(result.data["description"]) # Caption + >>> print(result.data["likes"]) # Like count """ - if isinstance(url, str): - validate_url(url) - else: - validate_url_list(url) - return await self._scrape_urls( url=url, dataset_id=self.DATASET_ID_POSTS, timeout=timeout, - sdk_function="posts", ) def posts_sync( self, url: Union[str, List[str]], - timeout: int = DEFAULT_TIMEOUT_MEDIUM, + timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, List[ScrapeResult]]: - """Collect detailed data from Instagram post URLs (sync wrapper).""" + """Synchronous version of posts().""" async def _run(): async with self.engine: @@ -206,268 +290,259 @@ async def _run(): return asyncio.run(_run()) - # --- Trigger Interface (Manual Control) --- - - async def posts_trigger(self, url: Union[str, List[str]]) -> "ScrapeJob": - """Trigger Instagram posts scrape (async - manual control).""" - + async def posts_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger post extraction job without waiting for results.""" sdk_function = get_caller_function_name() return await self._trigger_scrape_async( - urls=url, dataset_id=self.DATASET_ID_POSTS, sdk_function=sdk_function or "posts_trigger" + urls=url, + dataset_id=self.DATASET_ID_POSTS, + sdk_function=sdk_function, ) - def posts_trigger_sync(self, url: Union[str, List[str]]) -> "ScrapeJob": - """Trigger Instagram posts scrape (sync wrapper).""" - return asyncio.run(self.posts_trigger(url)) + def posts_trigger_sync(self, url: Union[str, List[str]]) -> ScrapeJob: + """Synchronous version of posts_trigger().""" + + async def _run(): + async with self.engine: + return await self.posts_trigger(url) + + return asyncio.run(_run()) async def posts_status(self, snapshot_id: str) -> str: - """Check Instagram posts status (async).""" + """Check status of a posts extraction job.""" return await self._check_status_async(snapshot_id) def posts_status_sync(self, snapshot_id: str) -> str: - """Check Instagram posts status (sync wrapper).""" - return asyncio.run(self.posts_status(snapshot_id)) + """Synchronous version of posts_status().""" + + async def _run(): 
+ async with self.engine: + return await self.posts_status(snapshot_id) + + return asyncio.run(_run()) async def posts_fetch(self, snapshot_id: str) -> Any: - """Fetch Instagram posts results (async).""" + """Fetch results of a completed posts extraction job.""" return await self._fetch_results_async(snapshot_id) def posts_fetch_sync(self, snapshot_id: str) -> Any: - """Fetch Instagram posts results (sync wrapper).""" - return asyncio.run(self.posts_fetch(snapshot_id)) + """Synchronous version of posts_fetch().""" + + async def _run(): + async with self.engine: + return await self.posts_fetch(snapshot_id) + + return asyncio.run(_run()) # ============================================================================ - # COMMENTS API - By Post URL + # REELS (URL-based extraction) # ============================================================================ - async def comments( + async def reels( self, url: Union[str, List[str]], - timeout: int = DEFAULT_TIMEOUT_MEDIUM, + timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, List[ScrapeResult]]: """ - Collect comments from Instagram post URL (async). - - Collects the latest comments from a specific Instagram post (up to 10 comments - with associated metadata). + Extract reel data from Instagram reel URLs. Args: - url: Instagram post URL or list of URLs (required) - timeout: Maximum wait time in seconds for polling (default: 240) + url: Reel URL or list of reel URLs + Example: "https://www.instagram.com/reel/C5Rdyj_q7YN/" + timeout: Maximum seconds to wait (default: 180) Returns: - ScrapeResult or List[ScrapeResult] with comment data + ScrapeResult for single URL, List[ScrapeResult] for multiple URLs + + Response data fields: + - post_id (str): Unique reel identifier + - shortcode (str): URL shortcode (e.g., "DTQygzxD6QC") + - product_type (str): Content type, typically "clips" for reels + - description (str): Reel caption text + - hashtags (list): Hashtags used in caption + - date_posted (str): ISO timestamp of posting + - likes (int): Number of likes + - views (int): Number of views + - video_play_count (int): Number of video plays + - num_comments (int): Number of comments + - length (float): Video duration in seconds + - video_url (str): Direct URL to video file + - audio_url (str): Direct URL to audio track + - thumbnail (str): Thumbnail image URL + - user_posted (str): Username who posted + - user_profile_url (str): URL to user's profile + - followers (int): User's follower count + - is_verified (bool): Whether user is verified + - top_comments (list): Top comments on the reel + - tagged_users (list): Users tagged in the reel + - is_paid_partnership (bool): Whether reel is sponsored Example: - >>> result = await scraper.comments( - ... url="https://instagram.com/p/ABC123", - ... timeout=240 - ... 
) + >>> result = await scraper.reels("https://instagram.com/reel/XYZ789/") + >>> print(result.data["views"]) # View count + >>> print(result.data["video_url"]) # Download URL """ - if isinstance(url, str): - validate_url(url) - else: - validate_url_list(url) - return await self._scrape_urls( url=url, - dataset_id=self.DATASET_ID_COMMENTS, + dataset_id=self.DATASET_ID_REELS, timeout=timeout, - sdk_function="comments", ) - def comments_sync( + def reels_sync( self, url: Union[str, List[str]], - timeout: int = DEFAULT_TIMEOUT_MEDIUM, + timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, List[ScrapeResult]]: - """Collect comments from Instagram post URL (sync wrapper).""" + """Synchronous version of reels().""" async def _run(): async with self.engine: - return await self.comments(url, timeout) + return await self.reels(url, timeout) return asyncio.run(_run()) - # --- Trigger Interface (Manual Control) --- - - async def comments_trigger(self, url: Union[str, List[str]]) -> "ScrapeJob": - """Trigger Instagram comments scrape (async - manual control).""" - + async def reels_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger reel extraction job without waiting for results.""" sdk_function = get_caller_function_name() return await self._trigger_scrape_async( urls=url, - dataset_id=self.DATASET_ID_COMMENTS, - sdk_function=sdk_function or "comments_trigger", + dataset_id=self.DATASET_ID_REELS, + sdk_function=sdk_function, ) - def comments_trigger_sync(self, url: Union[str, List[str]]) -> "ScrapeJob": - """Trigger Instagram comments scrape (sync wrapper).""" - return asyncio.run(self.comments_trigger(url)) + def reels_trigger_sync(self, url: Union[str, List[str]]) -> ScrapeJob: + """Synchronous version of reels_trigger().""" - async def comments_status(self, snapshot_id: str) -> str: - """Check Instagram comments status (async).""" + async def _run(): + async with self.engine: + return await self.reels_trigger(url) + + return asyncio.run(_run()) + + async def reels_status(self, snapshot_id: str) -> str: + """Check status of a reels extraction job.""" return await self._check_status_async(snapshot_id) - def comments_status_sync(self, snapshot_id: str) -> str: - """Check Instagram comments status (sync wrapper).""" - return asyncio.run(self.comments_status(snapshot_id)) + def reels_status_sync(self, snapshot_id: str) -> str: + """Synchronous version of reels_status().""" - async def comments_fetch(self, snapshot_id: str) -> Any: - """Fetch Instagram comments results (async).""" + async def _run(): + async with self.engine: + return await self.reels_status(snapshot_id) + + return asyncio.run(_run()) + + async def reels_fetch(self, snapshot_id: str) -> Any: + """Fetch results of a completed reels extraction job.""" return await self._fetch_results_async(snapshot_id) - def comments_fetch_sync(self, snapshot_id: str) -> Any: - """Fetch Instagram comments results (sync wrapper).""" - return asyncio.run(self.comments_fetch(snapshot_id)) + def reels_fetch_sync(self, snapshot_id: str) -> Any: + """Synchronous version of reels_fetch().""" + + async def _run(): + async with self.engine: + return await self.reels_fetch(snapshot_id) + + return asyncio.run(_run()) # ============================================================================ - # REELS API - By URL + # COMMENTS (URL-based extraction) # ============================================================================ - async def reels( + async def comments( self, url: Union[str, List[str]], - timeout: int = 
DEFAULT_TIMEOUT_MEDIUM, + timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, List[ScrapeResult]]: """ - Collect detailed data from Instagram reel URLs (async). - - Collects detailed data about Instagram reels from public profiles including - reel details, page/profile details, and attachments/media. + Extract comments from Instagram post or reel URLs. Args: - url: Instagram reel URL or list of URLs (required) - timeout: Maximum wait time in seconds for polling (default: 240) + url: Post/reel URL or list of URLs + Example: "https://www.instagram.com/p/CesFC7JLyFl/" + timeout: Maximum seconds to wait (default: 180) Returns: - ScrapeResult or List[ScrapeResult] with reel data + ScrapeResult for single URL, List[ScrapeResult] for multiple URLs + + Note: Returns a LIST of comment objects in result.data + + Response data fields (per comment): + - comment_id (str): Unique comment identifier + - post_id (str): ID of the post/reel commented on + - post_url (str): URL of the post/reel + - post_user (str): Username of post author + - comment_user (str): Username who wrote the comment + - comment_user_url (str): URL to commenter's profile + - comment (str): The comment text + - comment_date (str): ISO timestamp of comment + - likes_number (int): Number of likes on comment + - replies_number (int): Number of replies to comment Example: - >>> result = await scraper.reels( - ... url="https://instagram.com/reel/ABC123", - ... timeout=240 - ... ) + >>> result = await scraper.comments("https://instagram.com/p/ABC123/") + >>> for comment in result.data: + ... print(f"{comment['comment_user']}: {comment['comment']}") """ - if isinstance(url, str): - validate_url(url) - else: - validate_url_list(url) - return await self._scrape_urls( url=url, - dataset_id=self.DATASET_ID_REELS, + dataset_id=self.DATASET_ID_COMMENTS, timeout=timeout, - sdk_function="reels", ) - def reels_sync( + def comments_sync( self, url: Union[str, List[str]], - timeout: int = DEFAULT_TIMEOUT_MEDIUM, + timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, List[ScrapeResult]]: - """Collect detailed data from Instagram reel URLs (sync wrapper).""" + """Synchronous version of comments().""" async def _run(): async with self.engine: - return await self.reels(url, timeout) + return await self.comments(url, timeout) return asyncio.run(_run()) - # --- Trigger Interface (Manual Control) --- - - async def reels_trigger(self, url: Union[str, List[str]]) -> "ScrapeJob": - """Trigger Instagram reels scrape (async - manual control).""" - + async def comments_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger comment extraction job without waiting for results.""" sdk_function = get_caller_function_name() return await self._trigger_scrape_async( - urls=url, dataset_id=self.DATASET_ID_REELS, sdk_function=sdk_function or "reels_trigger" + urls=url, + dataset_id=self.DATASET_ID_COMMENTS, + sdk_function=sdk_function, ) - def reels_trigger_sync(self, url: Union[str, List[str]]) -> "ScrapeJob": - """Trigger Instagram reels scrape (sync wrapper).""" - return asyncio.run(self.reels_trigger(url)) - - async def reels_status(self, snapshot_id: str) -> str: - """Check Instagram reels status (async).""" - return await self._check_status_async(snapshot_id) - - def reels_status_sync(self, snapshot_id: str) -> str: - """Check Instagram reels status (sync wrapper).""" - return asyncio.run(self.reels_status(snapshot_id)) + def comments_trigger_sync(self, url: Union[str, List[str]]) -> ScrapeJob: + """Synchronous version of 
comments_trigger().""" - async def reels_fetch(self, snapshot_id: str) -> Any: - """Fetch Instagram reels results (async).""" - return await self._fetch_results_async(snapshot_id) - - def reels_fetch_sync(self, snapshot_id: str) -> Any: - """Fetch Instagram reels results (sync wrapper).""" - return asyncio.run(self.reels_fetch(snapshot_id)) + async def _run(): + async with self.engine: + return await self.comments_trigger(url) - # ============================================================================ - # CORE SCRAPING LOGIC - # ============================================================================ + return asyncio.run(_run()) - async def _scrape_urls( - self, - url: Union[str, List[str]], - dataset_id: str, - timeout: int, - sdk_function: Optional[str] = None, - ) -> Union[ScrapeResult, List[ScrapeResult]]: - """ - Scrape URLs using standard async workflow (trigger/poll/fetch). + async def comments_status(self, snapshot_id: str) -> str: + """Check status of a comments extraction job.""" + return await self._check_status_async(snapshot_id) - Args: - url: URL(s) to scrape - dataset_id: Instagram dataset ID - timeout: Maximum wait time in seconds (for polling) - sdk_function: SDK function name for monitoring (auto-detected if not provided) + def comments_status_sync(self, snapshot_id: str) -> str: + """Synchronous version of comments_status().""" - Returns: - ScrapeResult(s) - """ - if sdk_function is None: - sdk_function = get_caller_function_name() + async def _run(): + async with self.engine: + return await self.comments_status(snapshot_id) - is_single = isinstance(url, str) - url_list = [url] if is_single else url + return asyncio.run(_run()) - payload = [{"url": u} for u in url_list] + async def comments_fetch(self, snapshot_id: str) -> Any: + """Fetch results of a completed comments extraction job.""" + return await self._fetch_results_async(snapshot_id) - result = await self.workflow_executor.execute( - payload=payload, - dataset_id=dataset_id, - poll_interval=DEFAULT_POLL_INTERVAL, - poll_timeout=timeout, - include_errors=True, - normalize_func=self.normalize_result, - sdk_function=sdk_function, - ) + def comments_fetch_sync(self, snapshot_id: str) -> Any: + """Synchronous version of comments_fetch().""" - if is_single and isinstance(result.data, list) and len(result.data) == 1: - result.url = url if isinstance(url, str) else url[0] - result.data = result.data[0] - return result - elif not is_single and isinstance(result.data, list): - from ...models import ScrapeResult + async def _run(): + async with self.engine: + return await self.comments_fetch(snapshot_id) - results = [] - for url_item, data_item in zip(url_list, result.data): - results.append( - ScrapeResult( - success=True, - data=data_item, - url=url_item, - platform=result.platform, - method=result.method, - trigger_sent_at=result.trigger_sent_at, - snapshot_id_received_at=result.snapshot_id_received_at, - snapshot_polled_at=result.snapshot_polled_at, - data_fetched_at=result.data_fetched_at, - snapshot_id=result.snapshot_id, - cost=result.cost / len(result.data) if result.cost else None, - ) - ) - return results - return result + return asyncio.run(_run()) diff --git a/src/brightdata/scrapers/instagram/search.py b/src/brightdata/scrapers/instagram/search.py index 43b347f..bc80aa0 100644 --- a/src/brightdata/scrapers/instagram/search.py +++ b/src/brightdata/scrapers/instagram/search.py @@ -1,126 +1,304 @@ """ -Instagram Search Scraper - Discovery/parameter-based operations. 
+Instagram parameter-based discovery scraper. -Implements: -- client.search.instagram.posts() - Discover posts by profile URL with filters -- client.search.instagram.reels() - Discover reels by profile or search URL with filters +Supports: +- Profile discovery by username +- Posts discovery from profile with filters +- Reels discovery from profile with filters """ import asyncio -from typing import Union, List, Optional, Dict, Any +import os +from typing import List, Dict, Any, Optional, Union +from ..api_client import DatasetAPIClient +from ..workflow import WorkflowExecutor from ...core.engine import AsyncEngine from ...models import ScrapeResult -from ...utils.validation import validate_url, validate_url_list +from ...exceptions import ValidationError +from ...constants import ( + COST_PER_RECORD_INSTAGRAM, + DEFAULT_TIMEOUT_SHORT, + DEFAULT_POLL_INTERVAL, +) +from ...utils.validation import validate_url_list, validate_instagram_date from ...utils.function_detection import get_caller_function_name -from ...constants import DEFAULT_POLL_INTERVAL, DEFAULT_TIMEOUT_MEDIUM, COST_PER_RECORD_INSTAGRAM -from ..api_client import DatasetAPIClient -from ..workflow import WorkflowExecutor class InstagramSearchScraper: """ - Instagram Search Scraper for parameter-based discovery. + Instagram scraper for parameter-based content discovery. - Provides discovery methods that search Instagram by parameters - rather than extracting from specific URLs. This is a parallel component - to InstagramScraper, both doing Instagram data extraction but with - different approaches (parameter-based vs URL-based). + Unlike InstagramScraper (URL-based), this class discovers content + using parameters like username, date ranges, and filters. Example: - >>> scraper = InstagramSearchScraper(bearer_token="token") - >>> - >>> # Async usage + >>> scraper = InstagramSearchScraper(bearer_token="...") + >>> result = await scraper.profiles("nasa") # Find by username >>> result = await scraper.posts( - ... url="https://instagram.com/username", + ... url="https://instagram.com/nasa", ... num_of_posts=10, - ... post_type="reel" - ... ) - >>> - >>> # Sync usage - >>> result = scraper.posts_sync( - ... url="https://instagram.com/username", - ... num_of_posts=10, - ... post_type="reel" + ... start_date="01-01-2025" ... ) """ - # Dataset IDs for discovery endpoints - DATASET_ID_POSTS_DISCOVER = "gd_lk5ns7kz21pck8jpis" # Posts discover by URL - DATASET_ID_REELS_DISCOVER = "gd_lyclm20il4r5helnj" # Reels discover by URL + # Dataset IDs + DATASET_ID_PROFILES = "gd_l1vikfch901nx3by4" + DATASET_ID_POSTS = "gd_lk5ns7kz21pck8jpis" + DATASET_ID_REELS = "gd_lyclm20il4r5helnj" + + # Platform configuration + PLATFORM_NAME = "instagram" + MIN_POLL_TIMEOUT = DEFAULT_TIMEOUT_SHORT + COST_PER_RECORD = COST_PER_RECORD_INSTAGRAM - def __init__(self, bearer_token: str, engine: Optional[AsyncEngine] = None): + def __init__( + self, + bearer_token: Optional[str] = None, + engine: Optional[AsyncEngine] = None, + ): """ Initialize Instagram search scraper. Args: - bearer_token: Bright Data API token - engine: Optional AsyncEngine instance. If not provided, creates a new one. - Allows dependency injection for testing and flexibility. + bearer_token: Bright Data API token. If None, loads from environment. + engine: Optional AsyncEngine instance for connection reuse. 
""" - self.bearer_token = bearer_token - self.engine = engine if engine is not None else AsyncEngine(bearer_token) + self.bearer_token = bearer_token or os.getenv("BRIGHTDATA_API_TOKEN") + if not self.bearer_token: + raise ValidationError( + "Bearer token required for Instagram search. " + "Provide bearer_token parameter or set BRIGHTDATA_API_TOKEN environment variable." + ) + + # Reuse engine if provided, otherwise create new + self.engine = engine if engine is not None else AsyncEngine(self.bearer_token) self.api_client = DatasetAPIClient(self.engine) self.workflow_executor = WorkflowExecutor( api_client=self.api_client, - platform_name="instagram", - cost_per_record=COST_PER_RECORD_INSTAGRAM, + platform_name=self.PLATFORM_NAME, + cost_per_record=self.COST_PER_RECORD, ) # ============================================================================ - # POSTS DISCOVERY (by profile URL with filters) + # CONTEXT MANAGER SUPPORT + # ============================================================================ + + async def __aenter__(self): + """Async context manager entry.""" + await self.engine.__aenter__() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Async context manager exit.""" + await self.engine.__aexit__(exc_type, exc_val, exc_tb) + + # ============================================================================ + # INTERNAL HELPERS + # ============================================================================ + + async def _execute_discovery( + self, + payload: List[Dict[str, Any]], + dataset_id: str, + discover_by: str, + timeout: int, + ) -> ScrapeResult: + """ + Execute discovery operation with extra query parameters. + + Args: + payload: Request payload + dataset_id: Bright Data dataset identifier + discover_by: Discovery type (user_name, url, url_all_reels) + timeout: Maximum seconds to wait + + Returns: + ScrapeResult with discovered data + """ + sdk_function = get_caller_function_name() + + # Build extra params for discovery endpoints + extra_params = { + "type": "discover_new", + "discover_by": discover_by, + } + + # Use workflow_executor.execute() with extra_params support + result = await self.workflow_executor.execute( + payload=payload, + dataset_id=dataset_id, + poll_interval=DEFAULT_POLL_INTERVAL, + poll_timeout=timeout, + include_errors=True, + sdk_function=sdk_function, + extra_params=extra_params, + ) + + return result + + # ============================================================================ + # PROFILES DISCOVERY (by username) + # ============================================================================ + + async def profiles( + self, + user_name: Union[str, List[str]], + timeout: int = DEFAULT_TIMEOUT_SHORT, + ) -> ScrapeResult: + """ + Discover Instagram profiles by username (exact match). 
+ + Args: + user_name: Username or list of usernames (without @) + Example: "nasa" or ["nasa", "spacex"] + timeout: Maximum seconds to wait (default: 180) + + Returns: + ScrapeResult with profile data + + Response data fields: + - account (str): Username/handle + - id (str): Numeric user ID + - full_name (str): Display name + - profile_name (str): Profile display name + - profile_url (str): URL to profile + - profile_image_link (str): Profile picture URL + - followers (int): Follower count + - following (int): Following count + - posts_count (int): Number of posts + - highlights_count (int): Number of highlights + - is_verified (bool): Verification status + - is_private (bool): Whether account is private + - is_business_account (bool): Business account flag + - is_professional_account (bool): Professional account flag + - biography (str): Bio text + - bio_hashtags (list): Hashtags in bio + - category_name (str): Account category + - external_url (str): Link in bio + - avg_engagement (float): Average engagement rate + - posts (list): Recent posts data + - highlights (list): Highlights data + - related_accounts (list): Similar accounts + + Example: + >>> result = await scraper.profiles("nasa") + >>> print(result.data["followers"]) # 97896265 + >>> print(result.data["biography"]) # Bio text + """ + # Normalize to list + user_names = [user_name] if isinstance(user_name, str) else user_name + + # Build payload - IMPORTANT: field is "user_name" with underscore + payload = [{"user_name": name} for name in user_names] + + return await self._execute_discovery( + payload=payload, + dataset_id=self.DATASET_ID_PROFILES, + discover_by="user_name", + timeout=timeout, + ) + + def profiles_sync( + self, + user_name: Union[str, List[str]], + timeout: int = DEFAULT_TIMEOUT_SHORT, + ) -> ScrapeResult: + """Synchronous version of profiles().""" + + async def _run(): + async with self.engine: + return await self.profiles(user_name, timeout) + + return asyncio.run(_run()) + + # ============================================================================ + # POSTS DISCOVERY (by profile URL + filters) # ============================================================================ async def posts( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, - posts_to_not_include: Optional[List[str]] = None, start_date: Optional[str] = None, end_date: Optional[str] = None, post_type: Optional[str] = None, - timeout: int = DEFAULT_TIMEOUT_MEDIUM, - ) -> Union[ScrapeResult, List[ScrapeResult]]: + posts_to_not_include: Optional[List[str]] = None, + timeout: int = DEFAULT_TIMEOUT_SHORT, + ) -> ScrapeResult: """ - Discover recent Instagram posts from a public profile (async). - - Discovers posts from Instagram profiles, reels, or search URLs with - filtering options by date range, exclusion of specific posts, and post type. + Discover posts from Instagram profile with optional filters. 
Args: - url: Instagram profile, reel, or search URL (required) - num_of_posts: Number of recent posts to collect (optional, no limit if omitted) - posts_to_not_include: Array of post IDs to exclude from results - start_date: Start date for filtering posts in MM-DD-YYYY format - end_date: End date for filtering posts in MM-DD-YYYY format - post_type: Type of posts to collect (e.g., "post", "reel") - timeout: Maximum wait time in seconds for polling (default: 240) + url: Profile URL or list of profile URLs + Example: "https://www.instagram.com/nasa/" + num_of_posts: Maximum number of posts to return + start_date: Filter posts on or after this date (format: MM-DD-YYYY) + end_date: Filter posts on or before this date (format: MM-DD-YYYY) + post_type: Filter by type - "Post" or "Reel" + posts_to_not_include: List of post IDs to exclude + timeout: Maximum seconds to wait (default: 180) Returns: - ScrapeResult or List[ScrapeResult] with discovered posts + ScrapeResult with discovered posts (list in result.data) + + Response data fields (per post): + - post_id (str): Unique post identifier + - shortcode (str): URL shortcode + - content_type (str): "Image", "Video", or "Carousel" + - description (str): Post caption text + - date_posted (str): ISO timestamp of posting + - likes (int): Number of likes + - num_comments (int): Number of comments + - user_posted (str): Username who posted + - photos (list): List of photo URLs + - thumbnail (str): Thumbnail image URL + - is_paid_partnership (bool): Whether post is sponsored Example: >>> result = await scraper.posts( - ... url="https://instagram.com/username", + ... url="https://instagram.com/nasa", ... num_of_posts=10, ... start_date="01-01-2025", - ... end_date="12-31-2025", - ... post_type="reel" + ... post_type="Post" ... ) + >>> for post in result.data: + ... 
print(post["description"]) """ - if isinstance(url, str): - validate_url(url) - else: - validate_url_list(url) - - return await self._discover_with_params( - url=url, - dataset_id=self.DATASET_ID_POSTS_DISCOVER, - num_of_posts=num_of_posts, - posts_to_not_include=posts_to_not_include, - start_date=start_date, - end_date=end_date, - post_type=post_type, + # Normalize URL to list + urls = [url] if isinstance(url, str) else url + + # Validate URLs + validate_url_list(urls) + + # Validate dates if provided + if start_date: + validate_instagram_date(start_date) + if end_date: + validate_instagram_date(end_date) + + # Build payload - omit None values (don't send empty strings) + payload = [] + for u in urls: + item: Dict[str, Any] = {"url": u} + + if num_of_posts is not None: + item["num_of_posts"] = num_of_posts + if start_date: + item["start_date"] = start_date + if end_date: + item["end_date"] = end_date + if post_type: + item["post_type"] = post_type + if posts_to_not_include: + item["posts_to_not_include"] = posts_to_not_include + + payload.append(item) + + return await self._execute_discovery( + payload=payload, + dataset_id=self.DATASET_ID_POSTS, + discover_by="url", timeout=timeout, ) @@ -128,190 +306,210 @@ def posts_sync( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, - posts_to_not_include: Optional[List[str]] = None, start_date: Optional[str] = None, end_date: Optional[str] = None, post_type: Optional[str] = None, - timeout: int = DEFAULT_TIMEOUT_MEDIUM, - ) -> Union[ScrapeResult, List[ScrapeResult]]: - """Discover recent Instagram posts from a public profile (sync wrapper).""" + posts_to_not_include: Optional[List[str]] = None, + timeout: int = DEFAULT_TIMEOUT_SHORT, + ) -> ScrapeResult: + """Synchronous version of posts().""" async def _run(): async with self.engine: return await self.posts( url, num_of_posts, - posts_to_not_include, start_date, end_date, post_type, + posts_to_not_include, timeout, ) return asyncio.run(_run()) # ============================================================================ - # REELS DISCOVERY (by profile or search URL with filters) + # REELS DISCOVERY (by profile URL) # ============================================================================ async def reels( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, - posts_to_not_include: Optional[List[str]] = None, start_date: Optional[str] = None, end_date: Optional[str] = None, - timeout: int = DEFAULT_TIMEOUT_MEDIUM, - ) -> Union[ScrapeResult, List[ScrapeResult]]: + timeout: int = DEFAULT_TIMEOUT_SHORT, + ) -> ScrapeResult: """ - Discover Instagram Reels from profile or search URL (async). - - Discovers Instagram Reels videos from a profile URL or direct search URL - with filtering options by date range and exclusion of specific posts. + Discover reels from Instagram profile. 
Args: - url: Instagram profile or direct search URL (required) - num_of_posts: Number of recent reels to collect (optional, no limit if omitted) - posts_to_not_include: Array of post IDs to exclude from results - start_date: Start date for filtering reels in MM-DD-YYYY format - end_date: End date for filtering reels in MM-DD-YYYY format - timeout: Maximum wait time in seconds for polling (default: 240) + url: Profile URL or list of profile URLs + num_of_posts: Maximum number of reels to return + start_date: Filter reels on or after this date (format: MM-DD-YYYY) + end_date: Filter reels on or before this date (format: MM-DD-YYYY) + timeout: Maximum seconds to wait (default: 180) Returns: - ScrapeResult or List[ScrapeResult] with discovered reels + ScrapeResult with discovered reels (list in result.data) + + Response data fields (per reel): + - post_id (str): Unique reel identifier + - shortcode (str): URL shortcode + - product_type (str): Content type ("clips") + - description (str): Reel caption text + - date_posted (str): ISO timestamp of posting + - likes (int): Number of likes + - views (int): Number of views + - video_play_count (int): Number of video plays + - num_comments (int): Number of comments + - length (float): Video duration in seconds + - video_url (str): Direct URL to video file + - thumbnail (str): Thumbnail image URL + - user_posted (str): Username who posted Example: >>> result = await scraper.reels( - ... url="https://instagram.com/username", - ... num_of_posts=50, - ... start_date="01-01-2025", - ... end_date="12-31-2025", - ... timeout=240 + ... url="https://instagram.com/nasa", + ... num_of_posts=5 ... ) + >>> for reel in result.data: + ... print(f"{reel['views']} views") """ - if isinstance(url, str): - validate_url(url) - else: - validate_url_list(url) - - return await self._discover_with_params( - url=url, - dataset_id=self.DATASET_ID_REELS_DISCOVER, - num_of_posts=num_of_posts, - posts_to_not_include=posts_to_not_include, - start_date=start_date, - end_date=end_date, + # Normalize and validate + urls = [url] if isinstance(url, str) else url + validate_url_list(urls) + + if start_date: + validate_instagram_date(start_date) + if end_date: + validate_instagram_date(end_date) + + # Build payload + payload = [] + for u in urls: + item: Dict[str, Any] = {"url": u} + if num_of_posts is not None: + item["num_of_posts"] = num_of_posts + if start_date: + item["start_date"] = start_date + if end_date: + item["end_date"] = end_date + payload.append(item) + + return await self._execute_discovery( + payload=payload, + dataset_id=self.DATASET_ID_REELS, + discover_by="url", timeout=timeout, - sdk_function="reels", ) def reels_sync( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, - posts_to_not_include: Optional[List[str]] = None, start_date: Optional[str] = None, end_date: Optional[str] = None, - timeout: int = DEFAULT_TIMEOUT_MEDIUM, - ) -> Union[ScrapeResult, List[ScrapeResult]]: - """Discover Instagram Reels from profile or search URL (sync wrapper).""" + timeout: int = DEFAULT_TIMEOUT_SHORT, + ) -> ScrapeResult: + """Synchronous version of reels().""" async def _run(): async with self.engine: - return await self.reels( - url, num_of_posts, posts_to_not_include, start_date, end_date, timeout - ) + return await self.reels(url, num_of_posts, start_date, end_date, timeout) return asyncio.run(_run()) # ============================================================================ - # CORE DISCOVERY LOGIC + # REELS ALL DISCOVERY (by profile URL - all reels) # 
============================================================================
-    async def _discover_with_params(
+    async def reels_all(
         self,
         url: Union[str, List[str]],
-        dataset_id: str,
         num_of_posts: Optional[int] = None,
-        posts_to_not_include: Optional[List[str]] = None,
         start_date: Optional[str] = None,
         end_date: Optional[str] = None,
-        post_type: Optional[str] = None,
-        timeout: int = DEFAULT_TIMEOUT_MEDIUM,
-        sdk_function: Optional[str] = None,
-    ) -> Union[ScrapeResult, List[ScrapeResult]]:
+        timeout: int = DEFAULT_TIMEOUT_SHORT,
+    ) -> ScrapeResult:
         """
-        Discover content with additional parameters using standard async workflow.
+        Discover ALL reels from Instagram profile.
+
+        This differs from reels() by using discover_by=url_all_reels,
+        which may return more comprehensive results including archived reels.
 
         Args:
-            url: URL(s) to discover from
-            dataset_id: Instagram dataset ID
-            num_of_posts: Number of posts to collect
-            posts_to_not_include: Post IDs to exclude
-            start_date: Start date filter (MM-DD-YYYY)
-            end_date: End date filter (MM-DD-YYYY)
-            post_type: Type of posts to collect (for posts discovery only)
-            timeout: Maximum wait time in seconds
+            url: Profile URL or list of profile URLs
+            num_of_posts: Maximum number of reels to return
+            start_date: Filter reels on or after this date (format: MM-DD-YYYY)
+            end_date: Filter reels on or before this date (format: MM-DD-YYYY)
+            timeout: Maximum seconds to wait (default: 180)
 
         Returns:
-            ScrapeResult(s)
+            ScrapeResult with discovered reels (list in result.data)
+
+        Response data fields (per reel):
+            - post_id (str): Unique reel identifier
+            - shortcode (str): URL shortcode
+            - product_type (str): Content type ("clips")
+            - description (str): Reel caption text
+            - date_posted (str): ISO timestamp of posting
+            - likes (int): Number of likes
+            - views (int): Number of views
+            - video_play_count (int): Number of video plays
+            - num_comments (int): Number of comments
+            - length (float): Video duration in seconds
+            - video_url (str): Direct URL to video file
+            - thumbnail (str): Thumbnail image URL
+            - user_posted (str): Username who posted
+
+        Example:
+            >>> result = await scraper.reels_all(
+            ...     url="https://instagram.com/nasa",
+            ...     num_of_posts=20
+            ... )
         """
-        is_single = isinstance(url, str)
-        url_list = [url] if is_single else url
+        # Normalize and validate
+        urls = [url] if isinstance(url, str) else url
+        validate_url_list(urls)
+
+        if start_date:
+            validate_instagram_date(start_date)
+        if end_date:
+            validate_instagram_date(end_date)
 
+        # Build payload
         payload = []
-        for u in url_list:
+        for u in urls:
             item: Dict[str, Any] = {"url": u}
-
             if num_of_posts is not None:
                 item["num_of_posts"] = num_of_posts
-            if posts_to_not_include:
-                item["posts_to_not_include"] = posts_to_not_include
             if start_date:
                 item["start_date"] = start_date
             if end_date:
                 item["end_date"] = end_date
-            if post_type:
-                item["post_type"] = post_type
-
             payload.append(item)
 
-        if sdk_function is None:
-            sdk_function = get_caller_function_name()
-
-        result = await self.workflow_executor.execute(
+        # Key difference: discover_by=url_all_reels
+        return await self._execute_discovery(
             payload=payload,
-            dataset_id=dataset_id,
-            poll_interval=DEFAULT_POLL_INTERVAL,
-            poll_timeout=timeout,
-            include_errors=True,
-            normalize_func=None,
-            sdk_function=sdk_function,
+            dataset_id=self.DATASET_ID_REELS,
+            discover_by="url_all_reels",
+            timeout=timeout,
         )
 
-        if is_single and isinstance(result.data, list) and len(result.data) == 1:
-            result.url = url if isinstance(url, str) else url[0]
-            result.data = result.data[0]
-            return result
-        elif not is_single and isinstance(result.data, list):
-            from ...models import ScrapeResult
-
-            results = []
-            url_list = url if isinstance(url, list) else [url]
-            for url_item, data_item in zip(url_list, result.data):
-                results.append(
-                    ScrapeResult(
-                        success=True,
-                        data=data_item,
-                        url=url_item,
-                        platform="instagram",
-                        trigger_sent_at=result.trigger_sent_at,
-                        snapshot_id_received_at=result.snapshot_id_received_at,
-                        snapshot_polled_at=result.snapshot_polled_at,
-                        data_fetched_at=result.data_fetched_at,
-                        snapshot_id=result.snapshot_id,
-                        cost=result.cost / len(result.data) if result.cost else None,
-                    )
-                )
-            return results
-        return result
 
+    def reels_all_sync(
+        self,
+        url: Union[str, List[str]],
+        num_of_posts: Optional[int] = None,
+        start_date: Optional[str] = None,
+        end_date: Optional[str] = None,
+        timeout: int = DEFAULT_TIMEOUT_SHORT,
+    ) -> ScrapeResult:
+        """Synchronous version of reels_all()."""
+
+        async def _run():
+            async with self.engine:
+                return await self.reels_all(url, num_of_posts, start_date, end_date, timeout)
+
+        return asyncio.run(_run())
diff --git a/src/brightdata/scrapers/workflow.py b/src/brightdata/scrapers/workflow.py
index ab489d5..213c7e7 100644
--- a/src/brightdata/scrapers/workflow.py
+++ b/src/brightdata/scrapers/workflow.py
@@ -52,6 +52,7 @@ async def execute(
         include_errors: bool = True,
         normalize_func: Optional[Callable[[Any], Any]] = None,
         sdk_function: Optional[str] = None,
+        extra_params: Optional[Dict[str, str]] = None,
     ) -> ScrapeResult:
         """
         Execute complete trigger/poll/fetch workflow.
@@ -64,6 +65,8 @@ async def execute(
             include_errors: Include error records
             normalize_func: Optional function to normalize result data
             sdk_function: SDK function name for monitoring
+            extra_params: Additional query parameters for trigger (e.g., for discovery endpoints:
+                {"type": "discover_new", "discover_by": "user_name"})
 
         Returns:
             ScrapeResult with data or error
@@ -76,6 +79,7 @@ async def execute(
                 dataset_id=dataset_id,
                 include_errors=include_errors,
                 sdk_function=sdk_function,
+                extra_params=extra_params,
             )
         except APIError as e:
             return ScrapeResult(
diff --git a/src/brightdata/utils/validation.py b/src/brightdata/utils/validation.py
index 27e83aa..49d9494 100644
--- a/src/brightdata/utils/validation.py
+++ b/src/brightdata/utils/validation.py
@@ -154,3 +154,45 @@ def validate_http_method(method: str) -> None:
     valid_methods = ("GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS")
     if method.upper() not in valid_methods:
         raise ValidationError(f"Invalid HTTP method: {method}. Must be one of: {valid_methods}")
+
+
+def validate_instagram_date(date: str) -> None:
+    """
+    Validate Instagram date format (MM-DD-YYYY).
+
+    Instagram API requires dates in MM-DD-YYYY format, not ISO format.
+
+    Args:
+        date: Date string to validate.
+
+    Raises:
+        ValidationError: If date format is invalid.
+
+    Example:
+        >>> validate_instagram_date("01-15-2025")  # Valid
+        >>> validate_instagram_date("2025-01-15")  # Raises ValidationError
+    """
+    if not date or not isinstance(date, str):
+        raise ValidationError("Date must be a non-empty string")
+
+    # Check MM-DD-YYYY format
+    if not re.match(r"^\d{2}-\d{2}-\d{4}$", date):
+        raise ValidationError(
+            f"Invalid date format: {date}. Instagram requires MM-DD-YYYY format (e.g., '01-15-2025')"
+        )
+
+    # Validate actual date values
+    try:
+        month, day, year = date.split("-")
+        month_int = int(month)
+        day_int = int(day)
+        year_int = int(year)
+
+        if not (1 <= month_int <= 12):
+            raise ValidationError(f"Invalid month in date: {date}. Month must be 01-12")
+        if not (1 <= day_int <= 31):
+            raise ValidationError(f"Invalid day in date: {date}. Day must be 01-31")
+        if not (1900 <= year_int <= 2100):
+            raise ValidationError(f"Invalid year in date: {date}. Year must be 1900-2100")
+    except ValueError:
+        raise ValidationError(f"Invalid date format: {date}. Instagram requires MM-DD-YYYY format")
diff --git a/tests/unit/test_function_detection.py b/tests/unit/test_function_detection.py
index 1d4e7a0..fcf1319 100644
--- a/tests/unit/test_function_detection.py
+++ b/tests/unit/test_function_detection.py
@@ -122,17 +122,23 @@ def test_facebook_scraper_methods_accept_sdk_function(self):
             sig = inspect.signature(scraper._scrape_urls)
             assert "sdk_function" in sig.parameters
 
-    def test_instagram_scraper_methods_accept_sdk_function(self):
-        """Test Instagram scraper methods can track sdk_function."""
+    def test_instagram_scraper_methods_use_function_detection(self):
+        """Test Instagram scraper methods use function detection internally."""
         from brightdata.scrapers.instagram import InstagramScraper
         import inspect
 
         scraper = InstagramScraper(bearer_token="test_token_123456789")
 
-        # Check if internal methods accept sdk_function parameter
+        # Instagram scraper's _scrape_urls calls get_caller_function_name() internally
+        # rather than accepting sdk_function as a parameter
         if hasattr(scraper, "_scrape_urls"):
+            # Verify the method exists and is callable
+            assert callable(scraper._scrape_urls)
+            # Check it has the expected parameters (url, dataset_id, timeout)
             sig = inspect.signature(scraper._scrape_urls)
-            assert "sdk_function" in sig.parameters
+            assert "url" in sig.parameters
+            assert "dataset_id" in sig.parameters
+            assert "timeout" in sig.parameters
 
 
 class TestSDKFunctionUsagePatterns:
diff --git a/tests/unit/test_instagram.py b/tests/unit/test_instagram.py
index b89ed9e..0f43e64 100644
--- a/tests/unit/test_instagram.py
+++ b/tests/unit/test_instagram.py
@@ -46,8 +46,8 @@ def test_profiles_method_signature(self):
         assert "url" in sig.parameters
         assert "timeout" in sig.parameters
 
-        # Defaults
-        assert sig.parameters["timeout"].default == 240
+        # Defaults (180s = DEFAULT_TIMEOUT_SHORT, same as LinkedIn)
+        assert sig.parameters["timeout"].default == 180
 
     def test_posts_method_signature(self):
         """Test posts method has correct signature."""
@@ -58,7 +58,7 @@ def test_posts_method_signature(self):
         assert "url" in sig.parameters
         assert "timeout" in sig.parameters
 
-        assert sig.parameters["timeout"].default == 240
+        assert sig.parameters["timeout"].default == 180
 
     def test_comments_method_signature(self):
         """Test comments method has correct signature."""
@@ -69,7 +69,7 @@ def test_comments_method_signature(self):
         assert "url" in sig.parameters
         assert "timeout" in sig.parameters
 
-        assert sig.parameters["timeout"].default == 240
+        assert sig.parameters["timeout"].default == 180
 
     def test_reels_method_signature(self):
         """Test reels method has correct signature."""
@@ -80,12 +80,19 @@ def test_reels_method_signature(self):
         assert "url" in sig.parameters
         assert "timeout" in sig.parameters
 
-        assert sig.parameters["timeout"].default == 240
+        assert sig.parameters["timeout"].default == 180
 
 
 class TestInstagramSearchScraper:
     """Test Instagram search scraper (parameter-based discovery)."""
 
+    def test_instagram_search_scraper_has_profiles_method(self):
+        """Test Instagram search scraper has profiles method for username discovery."""
+        scraper = InstagramSearchScraper(bearer_token="test_token_123456789")
+
+        assert hasattr(scraper, "profiles")
+        assert callable(scraper.profiles)
+
     def test_instagram_search_scraper_has_posts_method(self):
         """Test Instagram search scraper has posts method (async-first API)."""
         scraper = InstagramSearchScraper(bearer_token="test_token_123456789")
@@ -100,6 +107,25 @@ def test_instagram_search_scraper_has_reels_method(self):
         assert hasattr(scraper, "reels")
         assert callable(scraper.reels)
 
+    def test_instagram_search_scraper_has_reels_all_method(self):
+        """Test Instagram search scraper has reels_all method."""
+        scraper = InstagramSearchScraper(bearer_token="test_token_123456789")
+
+        assert hasattr(scraper, "reels_all")
+        assert callable(scraper.reels_all)
+
+    def test_search_profiles_method_signature(self):
+        """Test search profiles method has correct signature."""
+        import inspect
+
+        scraper = InstagramSearchScraper(bearer_token="test_token_123456789")
+        sig = inspect.signature(scraper.profiles)
+
+        # Required: user_name parameter (NOT url)
+        assert "user_name" in sig.parameters
+        assert "timeout" in sig.parameters
+        assert sig.parameters["timeout"].default == 180
+
     def test_search_posts_method_signature(self):
         """Test search posts method has correct signature."""
         import inspect
@@ -112,14 +138,14 @@ def test_search_posts_method_signature(self):
 
         # Optional filters
         assert "num_of_posts" in sig.parameters
-        assert "posts_to_not_include" in sig.parameters
         assert "start_date" in sig.parameters
         assert "end_date" in sig.parameters
         assert "post_type" in sig.parameters
+        assert "posts_to_not_include" in sig.parameters
         assert "timeout" in sig.parameters
 
-        # Defaults
-        assert sig.parameters["timeout"].default == 240
+        # Defaults (180s = DEFAULT_TIMEOUT_SHORT)
+        assert sig.parameters["timeout"].default == 180
 
     def test_search_reels_method_signature(self):
@@ -130,11 +156,24 @@ def test_search_reels_method_signature(self):
         assert "url" in sig.parameters
         assert "num_of_posts" in sig.parameters
-        assert "posts_to_not_include" in sig.parameters
         assert "start_date" in sig.parameters
         assert "end_date" in sig.parameters
         assert "timeout" in sig.parameters
 
-        assert sig.parameters["timeout"].default == 240
+        assert sig.parameters["timeout"].default == 180
+
+    def test_search_reels_all_method_signature(self):
+        """Test search reels_all method has correct signature."""
+        import inspect
+
+        scraper = InstagramSearchScraper(bearer_token="test_token_123456789")
+        sig = inspect.signature(scraper.reels_all)
+
+        assert "url" in sig.parameters
+        assert "num_of_posts" in sig.parameters
+        assert "start_date" in sig.parameters
+        assert "end_date" in sig.parameters
+        assert "timeout" in sig.parameters
+        assert sig.parameters["timeout"].default == 180
 
 
 class TestInstagramDatasetIDs:
@@ -145,14 +184,12 @@ def test_scraper_has_all_dataset_ids(self):
         scraper = InstagramScraper(bearer_token="test_token_123456789")
 
         assert scraper.DATASET_ID  # Default: Profiles
-        assert scraper.DATASET_ID_PROFILES
         assert scraper.DATASET_ID_POSTS
         assert scraper.DATASET_ID_COMMENTS
         assert scraper.DATASET_ID_REELS
 
         # All should start with gd_
         assert scraper.DATASET_ID.startswith("gd_")
-        assert scraper.DATASET_ID_PROFILES.startswith("gd_")
         assert scraper.DATASET_ID_POSTS.startswith("gd_")
         assert scraper.DATASET_ID_COMMENTS.startswith("gd_")
         assert scraper.DATASET_ID_REELS.startswith("gd_")
@@ -161,11 +198,13 @@ def test_search_scraper_has_dataset_ids(self):
         """Test search scraper has dataset IDs."""
         scraper = InstagramSearchScraper(bearer_token="test_token_123456789")
 
-        assert scraper.DATASET_ID_POSTS_DISCOVER
-        assert scraper.DATASET_ID_REELS_DISCOVER
+        assert scraper.DATASET_ID_PROFILES
+        assert scraper.DATASET_ID_POSTS
+        assert scraper.DATASET_ID_REELS
 
-        assert scraper.DATASET_ID_POSTS_DISCOVER.startswith("gd_")
-        assert scraper.DATASET_ID_REELS_DISCOVER.startswith("gd_")
+        assert scraper.DATASET_ID_PROFILES.startswith("gd_")
+        assert scraper.DATASET_ID_POSTS.startswith("gd_")
+        assert scraper.DATASET_ID_REELS.startswith("gd_")
 
     def test_scraper_has_platform_name(self):
         """Test scraper has correct platform name."""
@@ -237,8 +276,10 @@ def test_client_instagram_search_has_methods(self):
         """Test client.search.instagram has discovery methods."""
         client = BrightDataClient(token="test_token_123456789")
 
+        assert hasattr(client.search.instagram, "profiles")
        assert hasattr(client.search.instagram, "posts")
         assert hasattr(client.search.instagram, "reels")
+        assert hasattr(client.search.instagram, "reels_all")
 
     def test_instagram_search_instance_from_client(self):
         """Test Instagram search instance is InstagramSearchScraper."""
@@ -329,3 +370,21 @@ def test_can_import_from_instagram_submodule(self):
         assert IG.__name__ == "InstagramScraper"
         assert IGSearch is not None
         assert IGSearch.__name__ == "InstagramSearchScraper"
+
+
+class TestInstagramDiscoveryExtraParams:
+    """Test Instagram discovery uses extra_params correctly."""
+
+    def test_search_scraper_has_execute_discovery_method(self):
+        """Test search scraper has internal _execute_discovery method."""
+        scraper = InstagramSearchScraper(bearer_token="test_token_123456789")
+
+        assert hasattr(scraper, "_execute_discovery")
+        assert callable(scraper._execute_discovery)
+
+    def test_search_scraper_has_context_manager(self):
+        """Test search scraper supports async context manager."""
+        scraper = InstagramSearchScraper(bearer_token="test_token_123456789")
+
+        assert hasattr(scraper, "__aenter__")
+        assert hasattr(scraper, "__aexit__")
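
A minimal end-to-end sketch of how the pieces added above fit together: the new `reels_all` discovery path (which internally triggers the reels dataset with `discover_by=url_all_reels`) plus the MM-DD-YYYY date validation. This is illustrative, not part of the patch; it assumes the token is picked up from the environment and that each item in `result.data` is a plain dict carrying the documented reel fields.

```python
import asyncio

from brightdata import BrightDataClient
from brightdata.utils.validation import validate_instagram_date


async def main() -> None:
    # The new validator enforces Instagram's MM-DD-YYYY date format up front.
    validate_instagram_date("01-01-2024")
    validate_instagram_date("06-30-2024")

    async with BrightDataClient() as client:
        # Discover all reels from a profile via the new reels_all endpoint.
        result = await client.search.instagram.reels_all(
            url="https://instagram.com/nasa",
            num_of_posts=5,
            start_date="01-01-2024",
            end_date="06-30-2024",
        )

        # result.data is documented as a list of reel records; treating each
        # record as a dict here is an assumption for illustration only.
        for reel in result.data or []:
            print(reel.get("shortcode"), reel.get("views"), reel.get("likes"))


asyncio.run(main())
```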