Module top_github_scraper.scrape_user
Expand source code
import json
import os
from pathlib import Path
from dotenv import load_dotenv
from rich import print
from top_github_scraper.utils import ScrapeGithubUrl, UserProfileGetter
load_dotenv()
USERNAME = os.getenv("GITHUB_USERNAME")
TOKEN = os.getenv("GITHUB_TOKEN")
def get_top_user_urls(
    keyword: str,
    save_directory: str = ".",
    start_page: int = 1,
    stop_page: int = 50,
):
    """Get the URLs of the user profiles that appear when searching for a
    specific keyword on GitHub.

    Parameters
    ----------
    keyword : str
        Keyword to search for (e.g., machine learning)
    save_directory : str, optional
        Directory to save the output file, by default "."
    start_page : int, optional
        Page number to start scraping from, by default 1
    stop_page : int, optional
        Page number of the last page to scrape, by default 50
    """
    Path(save_directory).mkdir(parents=True, exist_ok=True)
    save_path = f"{save_directory}/top_repo_urls_{keyword}_{start_page}_{stop_page}.json"
    # Search GitHub's "Users" results sorted by followers and collect the profile URLs.
    user_urls = ScrapeGithubUrl(
        keyword, 'Users', 'followers', start_page, stop_page
    ).scrape_top_repo_url_multiple_pages()
    with open(save_path, "w") as outfile:
        json.dump(user_urls, outfile)
def get_top_users(
    keyword: str,
    start_page: int = 1,
    stop_page: int = 50,
    save_directory: str = ".",
):
    """Get the profile information of the users that appear when searching
    for a specific keyword on GitHub.

    Parameters
    ----------
    keyword : str
        Keyword to search for (e.g., machine learning)
    start_page : int, optional
        Page number to start scraping from, by default 1
    stop_page : int, optional
        Page number of the last page to scrape, by default 50
    save_directory : str, optional
        Directory to save the output file, by default "."
    """
    full_url_save_path = (
        f"{save_directory}/top_repo_urls_{keyword}_{start_page}_{stop_page}.json"
    )
    user_save_path = f"{save_directory}/top_user_info_{keyword}_{start_page}_{stop_page}.csv"
    # Scrape and save the user URLs first if they are not on disk yet.
    if not Path(full_url_save_path).exists():
        get_top_user_urls(
            keyword=keyword,
            start_page=start_page,
            stop_page=stop_page,
            save_directory=save_directory,
        )
    with open(full_url_save_path, "r") as infile:
        user_urls = json.load(infile)
    # Build a GitHub API profile URL from each scraped user path.
    url = 'https://api.github.com/users'
    urls = [url + user for user in user_urls]
    top_users = UserProfileGetter(urls).get_all_user_profiles()
    top_users.to_csv(user_save_path)
    return top_users
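load_dotenv() above reads GITHUB_USERNAME and GITHUB_TOKEN from a .env file or the environment; they are not referenced again in this module, so presumably the scraping helpers in top_github_scraper.utils use them to authenticate GitHub API requests. A minimal .env sketch with placeholder values:

GITHUB_USERNAME=your-github-username
GITHUB_TOKEN=your-personal-access-token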
Functions
def get_top_user_urls(keyword: str, save_directory: str = '.', start_page: int = 1, stop_page: int = 50)
Get the URLs of the user profiles that appear when searching for a specific keyword on GitHub.

Parameters

keyword : str
    Keyword to search for (e.g., machine learning)
save_directory : str, optional
    Directory to save the output file, by default "."
start_page : int, optional
    Page number to start scraping from, by default 1
stop_page : int, optional
    Page number of the last page to scrape, by default 50
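A minimal usage sketch; the output filename follows the f-string in the source above, and "data" is just an example directory:

from top_github_scraper.scrape_user import get_top_user_urls

# Scrape the first two pages of GitHub's user search for "machine learning"
# and write data/top_repo_urls_machine learning_1_2.json.
get_top_user_urls("machine learning", save_directory="data", start_page=1, stop_page=2)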
def get_top_users(keyword: str, start_page: int = 1, stop_page: int = 50, save_directory: str = '.')
Get the profile information of the users that appear when searching for a specific keyword on GitHub.

Parameters

keyword : str
    Keyword to search for (e.g., machine learning)
start_page : int, optional
    Page number to start scraping from, by default 1
stop_page : int, optional
    Page number of the last page to scrape, by default 50
save_directory : str, optional
    Directory to save the output file, by default "."
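A minimal usage sketch; the to_csv call in the source suggests the return value is a pandas DataFrame, so it can be inspected directly:

from top_github_scraper.scrape_user import get_top_users

# Reuses data/top_repo_urls_machine learning_1_2.json if it exists, otherwise
# scrapes it first, then writes data/top_user_info_machine learning_1_2.csv
# and returns the user profiles.
top_users = get_top_users("machine learning", start_page=1, stop_page=2, save_directory="data")
print(top_users.head())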