diff --git a/.github/workflow/build.yml b/.github/workflow/build.yml new file mode 100644 index 0000000..0d26761 --- /dev/null +++ b/.github/workflow/build.yml @@ -0,0 +1,46 @@ +name: Build + +on: push + +jobs: + unittest: + name: Build wheel + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + + steps: + # Checks out a copy of your repository on the ubuntu-latest machine + - name: Checkout code + uses: actions/checkout@v3 + + # Select correct version of Python + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + # Install invoke + - name: Install setuptools, invoke and virtualenv + run: | + python -m pip install --upgrade pip setuptools virtualenv wheel build + python -m pip install invoke + + # Install the python package + - name: Install python package + run: | + invoke install + + # Build the Python package + - name: Build python package + run: | + invoke build + + # Archive production artifacts + - name: Archive built Python artifacts + uses: actions/upload-artifact@v3 + with: + name: dist + path: | + dist \ No newline at end of file diff --git a/.github/workflow/ci_cd.yml b/.github/workflow/ci_cd.yml new file mode 100644 index 0000000..435a105 --- /dev/null +++ b/.github/workflow/ci_cd.yml @@ -0,0 +1,43 @@ +name: Unittests and lint + +on: push + +jobs: + unittest: + name: Run the unit test and linter + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + + steps: + # Checks out a copy of your repository on the ubuntu-latest machine + - name: Checkout code + uses: actions/checkout@v3 + + # Select correct version of Python + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + # Install invoke + - name: Install setuptools, invoke and virtualenv + run: | + python -m pip install --upgrade pip setuptools virtualenv wheel + python -m pip install 
invoke + + # Install the python package + - name: Install python package + run: | + invoke install --extra test + + # Run the unit tests + - name: Run unit tests + run: | + invoke test --coverage + + # Run the linter + - name: Run linter + run: | + invoke lint \ No newline at end of file diff --git a/.github/workflow/publish.yml b/.github/workflow/publish.yml new file mode 100644 index 0000000..f0412ff --- /dev/null +++ b/.github/workflow/publish.yml @@ -0,0 +1,49 @@ +name: Publish documentation + +on: + push: + branches: + - documentation + - main + +permissions: + contents: write + id-token: write + pages: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + build: + name: Build and publish documentation + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + - name: Install setuptools and invoke + run: | + python -m pip install --upgrade pip setuptools wheel + python -m pip install invoke + + - name: Build documentation + run: | + invoke build --docs + + - name: Publish documentation + uses: JamesIves/github-pages-deploy-action@v4 + with: + folder: docs diff --git a/_config.yml b/_config.yml new file mode 100644 index 0000000..d604b27 --- /dev/null +++ b/_config.yml @@ -0,0 +1,8 @@ +title: YouTrend +description: YouTube Trending Video +theme: jekyll-theme-minimal + +# Color Customization +minima: + color: "#FF5733" # Main color (replace with your desired red color code) + accent_color: "#FF5733" # Acc diff --git a/docs/app.md b/docs/app.md new file mode 100644 index 0000000..2318152 --- /dev/null +++ 
b/docs/app.md @@ -0,0 +1,21 @@ +# Web Application with Dash + +## Overview + +This web application, built with Dash, serves multiple purposes, catering both to users interested in obtaining statistics about trending YouTube videos and to content creators. Below is a breakdown of the app's structure: + +### 1. Main Page + +On the "Main" page, users can explore the current ranking of the most popular videos based on the data collected at the moment. This section provides a real-time snapshot of the trending videos on YouTube. + +### 2. Descriptive Statistics Page + +The "Descriptive Statistics" page offers additional insights into the characteristics of trending videos. Users can access statistics related to themes, creators, and other relevant information. This section provides a more in-depth analysis to understand trends and patterns. + +### 3. Duration Model Page + +On the "Duration Model" page, users can leverage a predictive model to analyze the probability of a YouTube video reaching the trending section. The model considers various features such as video length, creator's subscriber count, and publication date to calculate the survival probability of a video. + +### 4. Diffusion Model Page + +The "Diffusion Model" page is designed for video creators. It enables them to generate thumbnails for their videos based on a text input, such as the video title. To access this feature, users need to provide a token generated on the Hugging Face website. The application provides a convenient way for creators to enhance their video presentation. 
\ No newline at end of file diff --git a/docs/duration_model.md b/docs/duration_model.md new file mode 100644 index 0000000..ffb1190 --- /dev/null +++ b/docs/duration_model.md @@ -0,0 +1,151 @@ +## Duration Model: Navigating YouTube Trending Probabilities + +Welcome to the Duration Model, where we embark on a journey through two main sections – **Make Prediction** and **Utility**. This module is designed to analyze the likelihood of a video entering the trending list and the probability of it not gaining traction. Let's explore each section step by step. + + + +### Make Prediction + +Welcome to the heart of the Duration Model – the **Make Prediction** section. This module provides robust functions for analyzing the probability of YouTube videos not reaching the trending section based on various features, including video length, creator's subscriber count, and publication date. + +#### Functions: + +##### `survival_probability(video_link, date, api_key, region_code, video_cat_enc)` + +Calculate the survival probability for a video. + +- **Parameters:** + - `video_link`: The link to the video. + - `date`: Date for calculating the survival probability. + - `api_key`: YouTube Data API key. + - `region_code`: Region code for fetching category labels. + - `video_cat_enc`: One-hot encoder for video categories. + +- **Returns:** Survival probability as a float. + +##### `plot_survival_probability(single_df, start_date, duration_days, gap, video_link, api_key, region_code, video_cat_enc)` + +Plot the survival probability over a specified duration for a given video. + +- **Parameters:** + - `single_df`: DataFrame containing details of a single video. + - `start_date`: Starting date for the survival probability calculation. + - `duration_days`: Duration for which survival probability is calculated. + - `gap`: Time gap between survival probability points. + - `video_link`: The link to the video. + - `api_key`: YouTube Data API key. 
+ - `region_code`: Region code for fetching category labels. + - `video_cat_enc`: One-hot encoder for video categories. + +- **Returns:** X and Y coordinates for plotting the survival probability. + +#### How to Use the Prediction Module: + +1. **Survival Probability Calculation:** + - Use `survival_probability` function to calculate the survival probability for a specific video based on its features. + + ```python + prob = survival_probability( + video_link="your_video_link", + date="2023-01-01", + api_key="your_api_key", + region_code="US", + video_cat_enc=VIDEO_CAT_ENCODER + ) + ``` + +2. **Plotting Survival Probability:** + - Utilize `plot_survival_probability` function to visualize the survival probability over a specified duration for a given video. + + ```python + x, y = plot_survival_probability( + single_df=your_single_video_data_frame, + start_date="2023-01-01", + duration_days=30, + gap=1, + video_link="your_video_link", + api_key="your_api_key", + region_code="US", + video_cat_enc=VIDEO_CAT_ENCODER + ) + ``` + +Unlock the potential of predicting YouTube trending probabilities with precision using YouTrend's Duration Model. Leverage these functions to make informed decisions about your content strategy and optimize your chances of reaching the trending list! + +## Utility Functions + +Ladies and gentlemen, let's dive into the powerhouse of the Duration Model - the **Utility Module**. This module is the backbone, providing essential functions for extracting, processing, and analyzing YouTube video data. It's the wizard behind the scenes, making predictions about the survival probability of a video on the platform. + +### Main Functions: + +#### `get_video_details(video_link, api_key, region_code, video_cat_enc) -> pd.DataFrame` + + Fetches comprehensive details of a YouTube video using its link. + + - **Parameters:** + - `video_link`: The link to the YouTube video. + - `api_key`: Your YouTube Data API key. 
+ - `region_code`: The region code for fetching category labels. Default is "US". + - `video_cat_enc`: Optional OneHotEncoder for video categories. + + - **Returns:** DataFrame containing rich information about the YouTube video. + +#### `get_video_id(video_link) -> str` + + Extracts the video ID from a YouTube video link. + + - **Parameters:** + - `video_link`: The YouTube video link. + + - **Returns:** The extracted video ID as a string. + +#### `preprocessing(filename, dataframe, on_loading, video_cat_enc) -> Tuple[pd.DataFrame, Optional[List[str]], Optional[OneHotEncoder]]` + + Processes the input data for the machine learning model. + + - **Parameters:** + - `filename`: Path to the CSV file containing the data. + - `dataframe`: DataFrame containing the data. + - `on_loading`: A boolean indicating whether the preprocessing is performed during loading. + - `video_cat_enc`: OneHotEncoder for later preprocessing before predictions. + + - **Returns:** + - `df`: Processed DataFrame. + - `model_features`: List of features for the duration model. + - `encoder`: OneHotEncoder for later preprocessing before predictions. + +#### `get_category_labels(api_key, region_code, youtube) -> Dict[str, str]` + + Retrieves YouTube video category labels. + + - **Parameters:** + - `api_key`: Your YouTube Data API key. + - `region_code`: The region code for fetching category labels. Default is "US". + - `youtube`: Optional. The YouTube API service object. + + - **Returns:** Dictionary mapping category IDs to category labels. + +#### `convert_duration_to_seconds(duration) -> int` + + Converts YouTube video duration from ISO 8601 format to seconds. + + - **Parameters:** + - `duration`: Duration string in ISO 8601 format. + + - **Returns:** Duration in seconds. + +#### `get_channel_subscriber_count(api, channel_ids) -> Optional[int]` + + Retrieves subscriber count for YouTube channels. + + - **Parameters:** + - `api`: The YouTube API service object. 
+ - `channel_ids`: List of YouTube channel IDs. + + - **Returns:** The subscriber count or None if an error occurs. + +### Global Constants: + +- `API_KEY (str)`: YouTube Data API key. + +And there you have it, the robust and versatile Utility Module, an indispensable part of our Duration Model. Let's give a round of applause for these functions that work tirelessly behind the scenes, making our predictions accurate and our analysis impeccable! 🚀✨ \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..ee097ae --- /dev/null +++ b/docs/index.md @@ -0,0 +1,25 @@ +# YouTrend: Unleash the Power of YouTube Trending Analysis + +Welcome to YouTrend, your ultimate companion in deciphering the intricacies of YouTube's trending video landscape! Our comprehensive package offers a suite of robust features designed to elevate your understanding and interaction with trending content on the platform. + +## Features at a Glance + +### 1. [Scraper: Unveiling the Data Goldmine](./scraper.md) +YouTrend's Scraper module is your key to unlocking the vast treasure trove of data from the Google YouTube API. Seamlessly collect detailed information about trending videos, paving the way for a deeper understanding of the trends that captivate audiences. + +### 2. [Exploratory Data Analysis: Decoding Trends](./app.md) +Dive into the heart of trending videos with our Exploratory Data Analysis feature. Uncover hidden patterns, identify emerging themes, and visualize key insights through interactive graphics. Gain a profound understanding of what makes a video trend-worthy and stay ahead of the curve. + +### 3. [Text to Image: Visualizing Potential](./stable_diffusion.md) +Revolutionize the way you approach video promotion with our Text to Image feature. By analyzing video descriptions and titles, YouTrend suggests compelling images tailored to enhance the visual appeal of your content. 
Elevate your video thumbnails, increasing the likelihood of catching the eye of potential viewers. + +### 4. [Duration Model: Predicting Success](./duration_model.md) +Navigate the complex landscape of trending probabilities with our Duration Model. This sophisticated analysis module models the likelihood of a video entering the trending list and, conversely, the probability of it not gaining traction. Leverage this predictive power to fine-tune your content strategy for maximum impact. + +### 5. [App: Your YouTrend Dashboard](./app.md) +Explore the YouTrend application, your interactive dashboard for all things trending on YouTube. Engage with the features seamlessly, making data-driven decisions and optimizing your content strategy for success. + +## Elevate Your YouTube Experience +YouTrend isn't just a tool; it's your guide to mastering the art and science of YouTube trending analysis. Whether you're a content creator, marketer, or avid viewer, YouTrend empowers you to stay ahead of trends, create captivating content, and make informed decisions in the dynamic world of YouTube. + +Unleash the potential of your content with YouTrend – where analytics meets creativity! Join us on this exciting journey into the heart of YouTube trends. \ No newline at end of file diff --git a/docs/scraper.md b/docs/scraper.md new file mode 100644 index 0000000..ec46d6d --- /dev/null +++ b/docs/scraper.md @@ -0,0 +1,64 @@ +## Scraper: Unleashing the Power of YouTube Trending Data + +Welcome to the heart of YouTrend's data-gathering prowess – the Scraper module. This essential component empowers you to delve deep into the YouTube trending landscape by collecting and parsing data from the Google YouTube API. Harnessing the capabilities of Python, requests, and pandas, our Scraper module ensures you have access to a rich dataset, enabling you to derive valuable insights and make informed decisions. 
+ +### Key Functions: + +#### `post_request_ytb(country_code_list, trending_type_dict)` +- Post an HTTP request to retrieve generic data about trending videos on YouTube. +- **Parameters:** + - `country_code_list`: List of countries from which to retrieve trending videos. + - `trending_type_dict`: Dictionary of different video categories to retrieve. +- **Returns:** Dictionary containing the response to the HTTP request. + +#### `collect_video_data(video_list, data_dictionary, trending_type, country_code)` +- Parse generic video data given an input list of videos previously collected. +- **Parameters:** + - `video_list`: List of videos retrieved using the `post_request_ytb` function. + - `data_dictionary`: Dictionary containing YouTube data to be updated in place. + - `trending_type`: Code corresponding to different categories in trending videos. + - `country_code`: Code corresponding to the country from which the videos are scraped. +- **Returns:** None. + +#### `collect_short_data(shorts_list, data_dictionary, country_code)` +- Parse generic short data given an input list of shorts previously collected. +- **Parameters:** + - `shorts_list`: List of shorts retrieved using the `post_request_ytb` function. + - `data_dictionary`: Dictionary containing YouTube data to be updated in place. + - `country_code`: Code corresponding to the country from which the videos are scraped. +- **Returns:** None. + +#### `add_now_videos_shorts(response_dict, data_dictionary)` +- Update the `data_dictionary` dictionary with trending videos and shorts from YouTube. +- **Parameters:** + - `response_dict`: Dictionary containing the response to the HTTP post request. + - `data_dictionary`: Dictionary containing YouTube data to be updated in place. +- **Returns:** None. + +#### `add_other_sections(response_dict, data_dictionary)` +- Update the `data_dictionary` dictionary with trending videos from categories other than "Now." 
+- **Parameters:** + - `response_dict`: Dictionary containing the response to the HTTP post request. + - `data_dictionary`: Dictionary containing YouTube data to be updated in place. +- **Returns:** None. + +#### `update_video_data(data_dictionary)` +- Update `data_dictionary` in place with information on individual videos. +- **Parameters:** + - `data_dictionary`: Dictionary containing YouTube data to be updated in place. +- **Returns:** None. + +#### `update_meta_data(data_dictionary)` +- Update `data_dictionary` in place with metadata information on individual videos. +- **Parameters:** + - `data_dictionary`: Dictionary containing YouTube data to be updated in place. +- **Returns:** None. + +### How to Run the Scraper Script: + +1. Create an empty `data_dictionary` to be filled. +2. Fill it with YouTube ID information and other basic information using `post_request_ytb`, `add_now_videos_shorts`, and `add_other_sections` functions. +3. Update the `data_dictionary` with individual data by calling `update_video_data` and `update_meta_data` functions. +4. Create the corresponding `.csv` file within the `/data` directory. + +Unleash the power of data-driven decision-making with YouTrend's Scraper module. Dive into the world of YouTube trending with confidence! 
\ No newline at end of file
diff --git a/docs/stable_diffusion.md b/docs/stable_diffusion.md
new file mode 100644
index 0000000..e69de29
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..da5eb6a
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,40 @@
+site_name: YouTrend
+site_url: https://example.com/
+nav:
+  - Home: index.md
+  - App: app.md
+  - Duration model: duration_model.md
+  - Scraper: scraper.md
+  - Stable Diffusion: stable_diffusion.md
+
+theme:
+  name: material
+  features:
+    - content.code.annotate
+  palette:
+    # Palette toggle for light mode
+    - scheme: default
+      toggle:
+        icon: material/brightness-7
+        name: Switch to dark mode
+
+    # Palette toggle for dark mode
+    - scheme: slate
+      toggle:
+        icon: material/brightness-4
+        name: Switch to light mode
+
+plugins:
+  - search
+  - section-index
+
+markdown_extensions:
+  - admonition
+  - pymdownx.highlight:
+      anchor_linenums: true
+  - pymdownx.inlinehilite
+  - pymdownx.snippets
+  - pymdownx.superfences
+  - pymdownx.magiclink
+  - pymdownx.details
+  - attr_list
\ No newline at end of file
diff --git a/tasks.py b/tasks.py
new file mode 100644
index 0000000..ea48e8b
--- /dev/null
+++ b/tasks.py
@@ -0,0 +1,184 @@
+"""Module defining a set of useful tasks for CI/CD."""
+import platform
+from contextlib import nullcontext
+
+from invoke import task
+
+SRC_FOLDER = "src"
+
+
+def get_activate_venv():
+    """Get the venv activation command depending on the OS.
+
+    Returns:
+        str: The shell command activating the .env virtualenv.
+    """
+    if platform.system() == "Windows":
+        # Raw string: the backslashes are path separators, not escape sequences.
+        return r".\.env\Scripts\activate"
+    return "source .env/bin/activate"
+
+
+def _venv_prefix(c, venv):
+    """Get a context manager running the commands in the venv if requested.
+
+    Args:
+        c: The invoke context used to run the commands.
+        venv (bool): Whether or not the commands should be run with the .env virtualenv
+            activated.
+
+    Returns:
+        The command prefix activating the venv if venv is True, a no-op context manager
+        otherwise.
+    """
+    return c.prefix(get_activate_venv()) if venv else nullcontext()
+
+
+@task
+def install_package(c,
+                    extra="",
+                    proxy="",
+                    editable=False,
+                    build_isolation=False):
+    """Install the Python package.
+
+    Args:
+        extra (str, optional): The extra packages to install, e.g. test.
+            Defaults to not installing any.
+        proxy (str, optional): The proxy to use for installation.
+            Defaults to no proxy.
+        editable (bool, optional): Whether or not the package should be installed in development
+            mode. Defaults to False.
+        build_isolation (bool, optional): Whether or not pip should build the package in an
+            isolated environment. Defaults to False, i.e. --no-build-isolation is passed to pip.
+    """
+    install_extra = f"[{extra}]" if extra else ""
+    install_through_proxies = f" --proxy {proxy}" if proxy else ""
+    build_isolation_cmd = "" if build_isolation else " --no-build-isolation"
+    # Named install_cmd so that it does not shadow the task itself.
+    install_cmd = "pip install -e ." if editable else "pip install ."
+    # Upgrade setuptools
+    c.run("pip install --upgrade setuptools" + install_through_proxies)
+    # Install package
+    c.run(install_cmd + install_extra + install_through_proxies + build_isolation_cmd)
+
+
+@task
+def install(c,
+            extra="",
+            proxy="",
+            editable=False,
+            build_isolation=False,
+            venv=False):
+    """Install the Python package, optionally in a venv.
+
+    Args:
+        extra (str, optional): The extra packages to install, e.g. test.
+            Defaults to not installing any.
+        proxy (str, optional): The proxy to use for installation.
+            Defaults to no proxy.
+        editable (bool, optional): Whether or not the package should be installed in development
+            mode. Defaults to False.
+        build_isolation (bool, optional): Whether or not pip should build the package in an
+            isolated environment. Defaults to False, i.e. --no-build-isolation is passed to pip.
+        venv (bool, optional): Whether or not the install should be done in a virtualenv, with a
+            name .env. Defaults to False.
+    """
+    if venv:
+        # The virtualenv has to exist before it can be activated.
+        c.run("python -m venv .env")
+    with _venv_prefix(c, venv):
+        install_package(c,
+                        extra=extra,
+                        proxy=proxy,
+                        editable=editable,
+                        build_isolation=build_isolation)
+
+
+@task
+def test(c, coverage=True, venv=False, report=""):
+    """Run the unit tests of the package.
+
+    The package should have been installed with its test dependencies beforehand.
+
+    Args:
+        coverage (bool, optional): Whether or not to output the coverage.
+            Defaults to True.
+        venv (bool, optional): Whether or not to run the tests in a venv. You need to have
+            installed the package in venv mode beforehand.
+            Defaults to False.
+        report (str, optional): The extra coverage report to produce, passed to --cov-report,
+            if coverage is set to True. Defaults to no extra report.
+    """
+    options = []
+    if coverage:
+        # The coverage options are only passed when the coverage is requested.
+        options.append(f"--cov={SRC_FOLDER}")
+        if report:
+            options.append(f"--cov-report {report}")
+        options.append("--cov-report term-missing")
+    test_cmd = " ".join(["pytest", *options, f"{SRC_FOLDER}/tests"])
+    with _venv_prefix(c, venv):
+        c.run(test_cmd)
+
+
+@task
+def lint(c, rc_file="", output_file="", venv=False):
+    """Lint the package using pylint.
+
+    pylint should have been installed beforehand.
+
+    Args:
+        rc_file (str, optional): The configuration to use for linting.
+            Defaults to the pylint standard configuration.
+        output_file (str, optional): The path to the output of the pylint command.
+            Defaults to no file.
+        venv (bool, optional): Whether or not the command should be run in the venv.
+            Defaults to False. The package needs to have been installed in a venv beforehand.
+    """
+    lint_cmd = ("pylint --recursive=y --exit-zero --output-format=parseable --reports=no "
+                f"{SRC_FOLDER}")
+    if rc_file:
+        lint_cmd += f" --rcfile={rc_file}"
+    if output_file:
+        # The shell redirection has to come last.
+        lint_cmd += f" > {output_file}"
+    with _venv_prefix(c, venv):
+        c.run(lint_cmd)
+
+
+@task
+def build_venv(c,
+               proxy="",
+               outdir=""):
+    """Bundle the Python code into a wheel from within a fresh .env virtualenv.
+
+    Args:
+        proxy (str, optional): The proxy to use for installation.
+            Defaults to not using any proxy.
+        outdir (str, optional): The path to the folder to store the wheel.
+            Defaults to dist.
+    """
+    outdir_cmd = f" --outdir {outdir}" if outdir else ""
+    install_through_proxies = f" --proxy {proxy}" if proxy else ""
+    # Create Python venv and activate
+    c.run("python -m venv .env")
+    with c.prefix(get_activate_venv()):
+        c.run("pip install --upgrade build pip setuptools" + install_through_proxies)
+        c.run("python -m build --no-isolation" + outdir_cmd)
+
+
+@task
+def build(c, output_dir="", docs=False):
+    """Build the package wheel, and optionally the documentation.
+
+    Args:
+        output_dir (str, optional): The path to the folder to store the built package.
+            Defaults to dist.
+        docs (bool, optional): Whether or not to also build the mkdocs documentation, into the
+            documentation folder. Defaults to False.
+    """
+    output_dir_cmd = f" --outdir {output_dir}" if output_dir else ""
+    c.run(f"python -m build{output_dir_cmd}")
+    if docs:
+        c.run("python -m mkdocs build -d documentation")
\ No newline at end of file