Module top_github_scraper.scrape_repo

Source code
import json
import logging
import os
from pathlib import Path

import pandas as pd
import requests
from dotenv import load_dotenv
from rich import print
from rich.progress import track
from tqdm import tqdm

from top_github_scraper.utils import ScrapeGithubUrl, UserProfileGetter, isnotebook

load_dotenv()

USERNAME = os.getenv("GITHUB_USERNAME")
TOKEN = os.getenv("GITHUB_TOKEN")

class RepoScraper:
    """Scrape information about repositories and their
    top contributors."""

    def __init__(self, repo_urls: list, max_n_top_contributors: int):
        self.repo_urls = repo_urls
        self.max_n_top_contributors = max_n_top_contributors

    def get_all_top_repo_information(self):
        top_repo_infos = []

        if isnotebook():
            for repo_url in tqdm(
                self.repo_urls, desc="Scraping top GitHub repositories..."
            ):
                top_repo_infos.append(self._get_repo_information(repo_url))
        else:
            for repo_url in track(
                self.repo_urls, description="Scraping top GitHub repositories..."
            ):
                top_repo_infos.append(self._get_repo_information(repo_url))

        return top_repo_infos

    def _get_repo_information(self, repo_url: str):
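        # repo_url is a path of the form "/<owner>/<repo>" scraped from the
        # search results, so it can be appended directly to the API base URL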
        repo_info_url = f"https://api.github.com/repos{repo_url}"
        repo_info = requests.get(repo_info_url, auth=(USERNAME, TOKEN)).json()
        info_to_scrape = ["stargazers_count", "forks_count"]
        repo_important_info = {}
        for info in info_to_scrape:
            repo_important_info[info] = repo_info[info]

        repo_important_info[
            "contributors"
        ] = self._get_contributor_repo_of_one_repo(repo_url)

        return repo_important_info

    def _get_contributor_repo_of_one_repo(self, repo_url: str):

        # https://api.github.com/repos/josephmisiti/awesome-machine-learning/contributors
        contributor_url = (
            f"https://api.github.com/repos{repo_url}/contributors"
        )
        contributor_page = requests.get(
            contributor_url, auth=(USERNAME, TOKEN)
        ).json()

        contributors_info = {"login": [], "url": [], "contributions": []}

        max_n_top_contributors = self._find_max_n_top_contributors(
            num_contributors=len(contributor_page)
        )
        n_top_contributor = 0

        while n_top_contributor < max_n_top_contributors:
            contributor = contributor_page[n_top_contributor]

            self._get_contributor_general_info(contributors_info, contributor)
            n_top_contributor += 1

        return contributors_info

    @staticmethod
    def _get_contributor_general_info(
        contributors_info: dict, contributor: dict
    ):

        contributors_info["login"].append(contributor["login"])
        contributors_info["url"].append(contributor["url"])
        contributors_info["contributions"].append(contributor["contributions"])

    def _find_max_n_top_contributors(self, num_contributors: int):
        if num_contributors > self.max_n_top_contributors:
            return self.max_n_top_contributors
        else:
            return num_contributors


class DataProcessor:
    def __init__(self, data: list):
        self.data = data

    def process(self) -> pd.DataFrame:

        repos = [self.process_one_repo(repo) for repo in self.data]
        return pd.concat(repos).reset_index(drop=True)

    def process_one_repo(self, repo_info: dict):
        contributors_info = repo_info["contributors"]
        contributors_info = pd.DataFrame(contributors_info)

        repo_stats = self.get_repo_stats(repo_info)

        for col_name, val in repo_stats.items():
            contributors_info[col_name] = val

        return contributors_info

    @staticmethod
    def get_repo_stats(repo_info: dict):
        repo_stats_list = [
            "stargazers_count",
            "forks_count",
            "created_at",
            "updated_at",
        ]
        return {
            key: val
            for key, val in repo_info.items()
            if key in repo_stats_list
        }


def get_top_repo_urls(
    keyword: str,
    sort_by: str = 'best_match',
    save_directory: str = ".",
    start_page: int = 1,
    stop_page: int = 10,
):
    """Get the URLs of the repositories that show up when searching for a specific
    keyword on GitHub.

    Parameters
    ----------
    keyword : str
        Keyword to search for (e.g., machine learning)
    sort_by : str, optional
        how to sort the search results, by default 'best_match'.
        Use 'stars' to sort by the number of stars.
    save_directory : str, optional
        directory to save the output file, by default "."
    start_page : int, optional
        page number to start scraping from, by default 1
    stop_page : int, optional
        page number of the last page to scrape, by default 10
    """
    try:
        Path(save_directory).mkdir(parents=True, exist_ok=True)
        full_path = f'{save_directory}/top_repo_urls_{keyword}_{sort_by}_{start_page}_{stop_page}.json'
        repo_urls = ScrapeGithubUrl(
            keyword, 'Repositories', sort_by, start_page, stop_page
        ).scrape_top_repo_url_multiple_pages()

        with open(full_path, "w") as outfile:
            json.dump(repo_urls, outfile)
        return repo_urls
    except Exception as e:
        print(e)
        logging.error(
            "You might have run out of API rate limit. Are you an authenticated user? "
            "If you ran out of rate limit while requesting as an authenticated user, "
            "either decrease the number of pages to scrape or wait until more requests are available."
        )


def get_top_repos(
    keyword: str,
    sort_by: str = 'best_match',
    save_directory: str = ".",
    max_n_top_contributors: int = 10,
    start_page: int = 1,
    stop_page: int = 10,
):
    """Get the information of the repositories that show up when searching for a specific
    keyword on GitHub.

    Parameters
    ----------
    keyword : str
        Keyword to search for (e.g., machine learning)
    sort_by : str, optional
        how to sort the search results, by default 'best_match'.
        Use 'stars' to sort by the number of stars.
    max_n_top_contributors : int, optional
        maximum number of top contributors to scrape from each repository, by default 10
    start_page : int, optional
        page number to start scraping from, by default 1
    stop_page : int, optional
        page number of the last page to scrape, by default 10
    save_directory : str, optional
        directory to save the output file, by default "."
    """
    try:
        full_url_save_path = (
            f"{save_directory}/top_repo_urls_{keyword}_{sort_by}_{start_page}_{stop_page}.json"
        )
        repo_save_path = f"{save_directory}/top_repo_info_{keyword}_{sort_by}_{start_page}_{stop_page}.json"

        if not Path(full_url_save_path).exists():
            get_top_repo_urls(keyword=keyword, sort_by=sort_by, save_directory=save_directory, start_page=start_page, stop_page=stop_page)
        with open(full_url_save_path, "r") as infile:
            repo_urls = json.load(infile)
            top_repos = RepoScraper(
                repo_urls, max_n_top_contributors
            ).get_all_top_repo_information()

        with open(repo_save_path, "w") as outfile:
            json.dump(top_repos, outfile)
        return top_repos

    except Exception as e:
        print(e)
        logging.error(
            "You might have run out of API rate limit. Are you an authenticated user? "
            "If you ran out of rate limit while requesting as an authenticated user, "
            "either decrease the number of pages to scrape or wait until more requests are available."
        )

def get_top_contributors(
    keyword: str,
    sort_by: str = 'best_match',
    max_n_top_contributors: int = 10,
    start_page: int = 1,
    stop_page: int = 10,
    get_user_info_only: bool = True,
    save_directory: str = ".",
):
    """Get the information of the owners and contributors of the repositories that show up
    when searching for a specific keyword on GitHub.

    Parameters
    ----------
    keyword : str
        Keyword to search for (e.g., machine learning)
    sort_by : str, optional
        how to sort the search results, by default 'best_match'.
        Use 'stars' to sort by the number of stars.
    max_n_top_contributors : int, optional
        maximum number of top contributors to scrape from each repository, by default 10
    start_page : int, optional
        page number to start scraping from, by default 1
    stop_page : int, optional
        page number of the last page to scrape, by default 10
    get_user_info_only : bool, optional
        whether to return only the contributors' information or both the contributors'
        information and that of the repositories they were scraped from, by default True
    save_directory : str, optional
        directory to save the output file, by default "."
    """

    full_repo_save_path = (
        f"{save_directory}/top_repo_info_{keyword}_{sort_by}_{start_page}_{stop_page}.json"
    )
    user_save_path = f"{save_directory}/top_contributor_info_{keyword}_{sort_by}_{start_page}_{stop_page}.csv"
    if not Path(full_repo_save_path).exists():
        get_top_repos(
            keyword=keyword,
            sort_by=sort_by,
            max_n_top_contributors=max_n_top_contributors,
            start_page=start_page,
            stop_page=stop_page,
            save_directory=save_directory
        )
    with open(full_repo_save_path, "r") as infile:
        repo_info = json.load(infile)
        repo_info = DataProcessor(repo_info).process()
        urls = repo_info['url']
        top_users = UserProfileGetter(urls).get_all_user_profiles()
        if get_user_info_only:
            top_users.to_csv(user_save_path)
            return top_users
        else:
            repo_and_top_users = pd.concat([repo_info, top_users], axis=1)
            repo_and_top_users = repo_and_top_users.loc[:,~repo_and_top_users.columns.duplicated()]
            repo_and_top_users.to_csv(user_save_path)
            return repo_and_top_users
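
The scrapers above authenticate every GitHub API request with the GITHUB_USERNAME and GITHUB_TOKEN environment variables, which load_dotenv() reads from a .env file at import time. A minimal setup sketch; the username and token values are placeholders:

# Option 1: a .env file next to the script that imports top_github_scraper
#   GITHUB_USERNAME=your-username
#   GITHUB_TOKEN=ghp_your_personal_access_token

# Option 2: set the variables in the environment before importing the module
import os

os.environ.setdefault("GITHUB_USERNAME", "your-username")
os.environ.setdefault("GITHUB_TOKEN", "ghp_your_personal_access_token")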

Functions

def get_top_contributors(keyword: str, sort_by: str = 'best_match', max_n_top_contributors: int = 10, start_page: int = 1, stop_page: int = 10, get_user_info_only: bool = True, save_directory: str = '.')

Get the information of the owners and contributors of the repositories that show up when searching for a specific keyword on GitHub.

Parameters

keyword : str
Keyword to search for (e.g., machine learning)
sort_by : str
how to sort the search results, by default 'best_match'. Use 'stars' to sort by the number of stars.
max_n_top_contributors : int
maximum number of top contributors to scrape from each repository, by default 10
start_page : int, optional
page number to start scraping from, by default 1
stop_page : int, optional
page number of the last page to scrape, by default 10
get_user_info_only : bool, optional
whether to return only the contributors' information or both the contributors' information and that of the repositories they were scraped from, by default True
save_directory : str, optional
directory to save the output file, by default "."
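
A usage sketch, assuming valid GitHub credentials are available in the environment; the intermediate URL and repository files are created automatically if they do not exist yet:

from top_github_scraper.scrape_repo import get_top_contributors

# Profiles of the top 5 contributors of each repository found on the first
# search result page; also written to ./top_contributor_info_<...>.csv.
contributors = get_top_contributors(
    "machine learning",
    max_n_top_contributors=5,
    stop_page=1,
)
print(contributors.head())
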
def get_top_repo_urls(keyword: str, sort_by: str = 'best_match', save_directory: str = '.', start_page: int = 1, stop_page: int = 10)

Get the URLs of the repositories that show up when searching for a specific keyword on GitHub.

Parameters

keyword : str
Keyword to search for (e.g., machine learning)
sort_by : str
how to sort the search results, by default 'best_match'. Use 'stars' to sort by the number of stars.
save_directory : str, optional
directory to save the output file, by default "."
start_page : int, optional
page number to start scraping from, by default 1
stop_page : int, optional
page number of the last page to scrape, by default 10
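
A usage sketch; the returned list contains repository URL paths such as "/josephmisiti/awesome-machine-learning" (the example path used in the module source above):

from top_github_scraper.scrape_repo import get_top_repo_urls

# Collect repository paths from the first two search result pages and save
# them to ./top_repo_urls_<...>.json in the current directory.
urls = get_top_repo_urls("machine learning", sort_by="stars", start_page=1, stop_page=2)
print(urls[:3])
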
def get_top_repos(keyword: str, sort_by: str = 'best_match', save_directory: str = '.', max_n_top_contributors: int = 10, start_page: int = 1, stop_page: int = 10)

Get the information of the repositories that show up when searching for a specific keyword on GitHub.

Parameters

keyword : str
Keyword to search for (e.g., machine learning)
sort_by : str
how to sort the search results, by default 'best_match'. Use 'stars' to sort by the number of stars.
max_n_top_contributors : int
maximum number of top contributors to scrape from each repository, by default 10
start_page : int, optional
page number to start scraping from, by default 1
stop_page : int, optional
page number of the last page to scrape, by default 10
save_directory : str, optional
directory to save the output file, by default "."
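
A usage sketch, assuming GitHub credentials are set; each element of the returned list holds the star and fork counts of one repository plus its top contributors:

from top_github_scraper.scrape_repo import get_top_repos

# Scrape up to 3 top contributors per repository from the first search result
# page; the result is also written to ./top_repo_info_<...>.json.
top_repos = get_top_repos("machine learning", max_n_top_contributors=3, stop_page=1)
print(top_repos[0]["stargazers_count"], top_repos[0]["contributors"]["login"])
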

Classes

class DataProcessor (data: list)
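
DataProcessor flattens the nested output of RepoScraper.get_all_top_repo_information into a single pandas DataFrame with one row per contributor and the repository statistics repeated on each row. A small sketch with hand-written input data (the values are made up for illustration):

from top_github_scraper.scrape_repo import DataProcessor

fake_repos = [
    {
        "stargazers_count": 100,
        "forks_count": 20,
        "contributors": {
            "login": ["alice", "bob"],
            "url": [
                "https://api.github.com/users/alice",
                "https://api.github.com/users/bob",
            ],
            "contributions": [50, 10],
        },
    }
]
df = DataProcessor(fake_repos).process()
print(df[["login", "contributions", "stargazers_count"]])
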

Static methods

def get_repo_stats(repo_info: dict)

Methods

def process(self) -> pandas.core.frame.DataFrame
def process_one_repo(self, repo_info: dict)
class RepoScraper (repo_urls: list, max_n_top_contributors: int)

Scrape information about repositories and their top contributors.
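
A sketch of using the class directly, assuming repo_urls holds URL paths as returned by get_top_repo_urls (e.g. "/josephmisiti/awesome-machine-learning") and GitHub credentials are set:

from top_github_scraper.scrape_repo import RepoScraper

scraper = RepoScraper(
    repo_urls=["/josephmisiti/awesome-machine-learning"],
    max_n_top_contributors=3,
)
repo_infos = scraper.get_all_top_repo_information()
print(repo_infos[0]["stargazers_count"], repo_infos[0]["contributors"]["login"])
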


Methods

def get_all_top_repo_information(self)