Module top_github_scraper.utils

from bs4 import BeautifulSoup
import requests
from rich.progress import track
import pandas as pd
import os
import warnings
from dotenv import load_dotenv
from typing import List
from IPython import get_ipython
from tqdm import tqdm
import logging


load_dotenv()
warnings.filterwarnings("ignore")

TYPES = ['Users', 'Repositories', 'Code', 'Commits', 'Issues', 'Packages', 'Topics']
SORT_BY = {'Users': ['followers'], 'Repositories': ['', 'stars']}
# CSS classes that identify result links on GitHub's search pages
SCRAPE_CLASS = {'Users': 'mr-1', 'Repositories': 'v-align-middle'}

USERNAME = os.getenv("GITHUB_USERNAME")
TOKEN = os.getenv("GITHUB_TOKEN")

if USERNAME is None or TOKEN is None:
    logging.warning(
        "You are using the GitHub API as an unauthenticated user. "
        "Unauthenticated requests are limited to 60 per hour. "
        "Follow the instructions at "
        "https://github.com/khuyentran1401/top-github-scraper#setup "
        "to authenticate and increase your rate limit."
    )
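
# A minimal sketch of the .env file that load_dotenv() reads above
# (the values are placeholders, not real credentials):
#
#   GITHUB_USERNAME=your-username
#   GITHUB_TOKEN=ghp_your-personal-access-token
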
class ScrapeGithubUrl:
    """Scrape top Github urls based on a certain keyword and type

    Parameters
    -------
    keyword: str
        keyword to search on Github
    type: str
        whether to search for User or Repositories
    sort_by: str 
        sort by best match or most stars, by default 'best_match', which will sort by best match. 
        Use 'stars' to sort by most stars.
    start_page_num: int
        page number to start scraping. The default is 0
    stop_page_num: int
        page number to stop scraping

    Returns
    -------
    List[str]
    """

    def __init__(self, keyword: str, type: str, sort_by: str, start_page_num: int, stop_page_num: int):
        self.keyword = keyword
        self.type = type
        self.start_page_num = start_page_num
        self.stop_page_num = stop_page_num
        # GitHub's search URL treats an empty 's' parameter as 'best match'
        if sort_by == 'best_match':
            self.sort_by = ''
        else:
            self.sort_by = sort_by

    @staticmethod
    def _keyword_to_url(page_num: int, keyword: str, type: str, sort_by: str):
        """Turn a search keyword into a GitHub search URL"""
        keyword_no_space = "+".join(keyword.split(" "))
        return f"https://github.com/search?o=desc&p={page_num}&q={keyword_no_space}&s={sort_by}&type={type}"

    def _scrape_top_repo_url_one_page(self, page_num: int):
        """Scrape the URLs of top GitHub repositories or users on one search page"""
        url = self._keyword_to_url(page_num, self.keyword, type=self.type, sort_by=self.sort_by)
        page = requests.get(url)

        soup = BeautifulSoup(page.text, "html.parser")
        a_tags = soup.find_all("a", class_=SCRAPE_CLASS[self.type])
        urls = [a_tag.get("href") for a_tag in a_tags]

        return urls

    def scrape_top_repo_url_multiple_pages(self):
        """Scrape the URLs of top GitHub repositories or users across multiple search pages"""
        urls = []
        # tqdm renders well in notebooks; rich's track suits terminals
        if isnotebook():
            for page_num in tqdm(
                range(self.start_page_num, self.stop_page_num),
                desc="Scraping top GitHub URLs...",
            ):
                urls.extend(self._scrape_top_repo_url_one_page(page_num))
        else:
            for page_num in track(
                range(self.start_page_num, self.stop_page_num),
                description="Scraping top GitHub URLs...",
            ):
                urls.extend(self._scrape_top_repo_url_one_page(page_num))

        return urls

class UserProfileGetter:
    """Get profile information from the GitHub API for a list of user URLs"""

    def __init__(self, urls: List[str]):
        self.urls = urls
        self.profile_features = [
            "login",
            "url",
            "type",
            "name",
            "company",
            "location",
            "email",
            "hireable",
            "bio",
            "public_repos",
            "public_gists",
            "followers",
            "following",
        ]

    def _get_one_user_profile(self, profile_url: str):
        # Only authenticate when credentials are available
        auth = (USERNAME, TOKEN) if USERNAME and TOKEN else None
        profile = requests.get(profile_url, auth=auth).json()
        return {
            key: val
            for key, val in profile.items()
            if key in self.profile_features
        }

    def get_all_user_profiles(self):
        """Scrape every profile in self.urls and return them as a pandas DataFrame"""
        if isnotebook():
            all_contributors = [
                self._get_one_user_profile(url)
                for url in tqdm(
                    self.urls, desc="Scraping top GitHub profiles..."
                )
            ]
        else:
            all_contributors = [
                self._get_one_user_profile(url)
                for url in track(
                    self.urls, description="Scraping top GitHub profiles..."
                )
            ]
        all_contributors_df = pd.DataFrame(all_contributors).reset_index(
            drop=True
        )

        return all_contributors_df


def isnotebook():
    """Return True when running inside a Jupyter notebook or qtconsole"""
    try:
        shell = get_ipython().__class__.__name__
        if shell == 'ZMQInteractiveShell':
            return True   # Jupyter notebook or qtconsole
        elif shell == 'TerminalInteractiveShell':
            return False  # Terminal running IPython
        else:
            return False  # Some other interactive shell
    except NameError:
        return False      # Probably a standard Python interpreter

Functions

def isnotebook()
Return True when running inside a Jupyter notebook or qtconsole, and False in a terminal IPython session or a standard Python interpreter.
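
A minimal sketch of the pattern this check enables, mirroring scrape_top_repo_url_multiple_pages in the module source above (items stands in for any iterable; tqdm and track come from the module's imports):

    if isnotebook():
        iterator = tqdm(items, desc="Working...")           # tqdm renders well in notebooks
    else:
        iterator = track(items, description="Working...")   # rich's track suits terminals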

Classes

class ScrapeGithubUrl (keyword: str, type: str, sort_by: str, start_page_num: int, stop_page_num: int)

Scrape top GitHub URLs for a given keyword and search type.

Parameters

keyword : str
    keyword to search for on GitHub
type : str
    whether to search for 'Users' or 'Repositories'
sort_by : str
    'best_match' (the default) to sort by best match, or 'stars' to sort by most stars
start_page_num : int
    page number to start scraping from, by default 0
stop_page_num : int
    page number to stop scraping at

Returns

List[str]
 
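
A minimal usage sketch; the keyword and page range are illustrative:

    scraper = ScrapeGithubUrl(
        keyword="machine learning",
        type="Repositories",
        sort_by="stars",
        start_page_num=0,
        stop_page_num=2,
    )
    urls = scraper.scrape_top_repo_url_multiple_pages()
    # Each entry is an href scraped from the search results,
    # e.g. a repository path such as '/owner/repo'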

Methods

def scrape_top_repo_url_multiple_pages(self)

Scrape the URLs of top GitHub repositories or users across multiple search pages

class UserProfileGetter (urls: List[str])

Get profile information from the GitHub API for a list of user URLs

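A minimal usage sketch; the profile URL is illustrative, and any GitHub API user URL of the form https://api.github.com/users/&lt;login&gt; works:

    getter = UserProfileGetter(["https://api.github.com/users/khuyentran1401"])
    profiles = getter.get_all_user_profiles()
    # One row per user, with columns such as 'login', 'followers',
    # and 'public_repos' (see profile_features in the source above)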

Methods

def get_all_user_profiles(self)
Scrape every profile in self.urls and return them as a pandas DataFrame, one row per user.