Source code for gerrytools.data.acs

import io
from urllib.request import urlopen
from zipfile import ZipFile

import censusdata
import pandas as pd


def cvap(state, geometry="tract", year=2020) -> pd.DataFrame:
    """
    Retrieves and CSV-formats 5-year CVAP data for the provided state at the
    specified geometry level. Variables and descriptions are
    [listed here](https://tinyurl.com/3mnrm56s).

    Args:
        state (us.State): The `State` object for which we're retrieving the
            ACS CVAP Special Tab. Only its `fips` attribute is read.
        geometry (str, optional): Level of geometry for which we're getting
            data. Accepted values are `"block group"` and `"tract"`; any other
            value prints a notice and falls back to `"tract"`. Defaults to
            `"tract"`.
        year (int, optional): Year for which data is retrieved. Defaults to
            2020.

    Returns:
        A `DataFrame` with one row per geometric unit. The identifier column
        is named after the geometry with a two-digit suffix (`"10"` for years
        before 2020, `"20"` otherwise), e.g. `TRACT20` or `BLOCKGROUP10`. Each
        CVAP statistic contributes an estimate column (`<NAME><yy>`) and a
        margin-of-error column (`<NAME><yy>e`), and a derived `POCCVAP<yy>`
        column holds total CVAP minus non-Hispanic White CVAP.
    """
    # Maps line numbers to descriptors.
    descriptions = {
        1: "CVAP",
        2: "NHCVAP",
        3: "NHAMINCVAP",
        4: "NHASIANCVAP",
        5: "NHBLACKCVAP",
        6: "NHNHPICVAP",
        7: "NHWHITECVAP",
        8: "NHWHITEAMINCVAP",
        9: "NHWHITEASIANCVAP",
        10: "NHWHITEBLACKCVAP",
        11: "NHBLACKAMINCVAP",
        12: "NHOTHCVAP",
        13: "HCVAP",
    }

    # Allowed geometry values are "block group" and "tract"; anything else
    # falls back to tracts.
    if geometry not in {"block group", "tract"}:
        print(f'Requested geometry "{geometry}" is not allowed; ' "loading tracts.")
        geometry = "tract"

    # Load the raw data.
    raw = _raw(geometry, year)

    # Create a STATE column for filtering and keep only the rows matching the
    # state FIPS code. The code is zero-padded to two characters (as `acs5`
    # does) so that it compares correctly against the two-character GEOID
    # prefix.
    raw["GEOID"] = raw["geoid"].str.split("US").str[1]
    raw["STATE"] = raw["GEOID"].str[:2]
    instate = raw[raw["STATE"] == str(state.fips).zfill(2)]

    # The ACS data is in a line-numbered format: each chunk of 13 consecutive
    # lines matches an individual geometry, and each of those lines describes
    # an individual statistic. We collapse each chunk into a single record and
    # build a dataframe from the collapsed records.
    records = instate.to_dict(orient="records")

    # Year-dependent pieces of the output column names.
    decade = "10" if year < 2020 else "20"
    yearsuffix = str(year)[2:]
    geoid_column = geometry.replace(" ", "").upper() + decade
    chunk_size = len(descriptions)

    collapsed = []
    for start in range(0, len(records), chunk_size):
        record = {}

        # "Collapse" every line of the chunk into the single record.
        for line in records[start : start + chunk_size]:
            name = descriptions[line["lnnumber"]] + yearsuffix
            record[geoid_column] = line["GEOID"]
            record[name] = line["cvap_est"]
            record[name + "e"] = line["cvap_moe"]

        collapsed.append(record)

    # Create a dataframe and a POCCVAP column; all people minus non-Hispanic
    # White.
    data = pd.DataFrame.from_records(collapsed)
    data[f"POCCVAP{yearsuffix}"] = (
        data[f"CVAP{yearsuffix}"] - data[f"NHWHITECVAP{yearsuffix}"]
    )

    return data
def acs5(
    state, geometry="tract", year=2020, columns=None, white="NHWHITEVAP"
) -> pd.DataFrame:
    """
    Retrieves ACS 5-year population estimates for the provided state, geometry
    level, and year. Also retrieves ACS-reported CVAP data, which closely
    matches that reported by the CVAP special tabulation; CVAP data are only
    returned at the tract level, and are otherwise reported as 0.

    Args:
        state (us.State): `State` object for the desired state. Only its
            `fips` attribute is read.
        geometry (str, optional): Geometry level at which data is retrieved.
            Acceptable values are `"tract"` and `"block group"`. Defaults to
            `"tract"`.
        year (int, optional): Year for which data is retrieved. Defaults to
            2020.
        columns (list, optional): Additional ACS variable names to retrieve
            alongside the default set. If `None` (the default) or empty, only
            the default columns are returned: total populations by race and
            ethnicity and (citizen) voting-age populations by race and
            ethnicity, along with a geometry identifier column.
        white (str, optional): The column (named without its year suffix)
            removed from the VAP total when calculating the POC voting-age
            population. Defaults to `"NHWHITEVAP"`.

    Returns:
        A DataFrame containing the formatted data. The identifier column is
        named after the geometry with a two-digit suffix (`"10"` for years
        before 2020, `"20"` otherwise), e.g. `TRACT20`.
    """
    # `None` means "no extra columns"; copying also avoids sharing a mutable
    # default between calls and accepts any iterable of names.
    extra = [] if columns is None else list(columns)

    # Columns for total populations.
    yearsuffix = str(year)[-2:]
    popcolumns = {
        "B01001_001E": "TOTPOP" + yearsuffix,
        "B03002_003E": "WHITE" + yearsuffix,
        "B03002_004E": "BLACK" + yearsuffix,
        "B03002_005E": "AMIN" + yearsuffix,
        "B03002_006E": "ASIAN" + yearsuffix,
        "B03002_007E": "NHPI" + yearsuffix,
        "B03002_008E": "OTH" + yearsuffix,
        "B03002_009E": "2MORE" + yearsuffix,
        "B03002_002E": "NHISP" + yearsuffix,
    }

    # Dictionary of column groups: output column name -> list of ACS variables
    # which are summed to produce it.
    groups = {}
    tables = ["A", "B", "C", "D", "E", "F", "G", "H", "I"]

    # Get VAP columns. The columns listed here are by race, irrespective of
    # ethnicity; for example, WHITEVAP19 is the group of people who identified
    # White as their *only* race, including people who identified as
    # Hispanic and White.
    vapnames = [
        "WHITEVAP",
        "BLACKVAP",
        "AMINVAP",
        "ASIANVAP",
        "NHPIVAP",
        "OTHVAP",
        "2MOREVAP",
        "NHWHITEVAP",
        "HVAP",
    ]
    for name, table in zip(vapnames, tables):
        groups[name + yearsuffix] = _variables(f"B01001{table}", 7, 16) + _variables(
            f"B01001{table}", 22, 31
        )

    # Get CVAP columns; the same goes for these columns as does the above,
    # except these columns are 18 years and older *and* citizens.
    cvapnames = [
        "WHITECVAP",
        "BLACKCVAP",
        "AMINCVAP",
        "ASIANCVAP",
        "NHPICVAP",
        "OTHCVAP",
        "2MORECVAP",
        "NHWHITECVAP",
        "HCVAP",
    ]
    for name, table in zip(cvapnames, tables):
        groups[name + yearsuffix] = (
            _variables(f"B05003{table}", 9, 9)
            + _variables(f"B05003{table}", 11, 11)
            + _variables(f"B05003{table}", 20, 20)  # men
            + _variables(f"B05003{table}", 22, 22)  # women
        )

    # Get all voting-age people and citizen voting-age people.
    groups["VAP" + yearsuffix] = _variables("B01001", 7, 25) + _variables(
        "B01001", 31, 49
    )
    groups["CVAP" + yearsuffix] = (
        _variables("B05003", 9, 9)
        + _variables("B05003", 11, 11)
        + _variables("B05003", 20, 20)
        + _variables("B05003", 22, 22)
    )

    # TODO: all variables used across the data submodule should be packaged up
    # as a class, so we can access individual dictionaries of variables to add.
    # For example, we should have a `Variables.acs5.vap` property which gives
    # us the voting-age population variables for the ACS 5-year estimates.

    # Get the list of all columns.
    allcols = (
        list(popcolumns.keys())
        + [variable for group in groups.values() for variable in group]
        + extra
    )

    # Retrieve the data from the Census API.
    data = censusdata.download(
        "acs5",
        year,
        censusdata.censusgeo(
            [("state", str(state.fips).zfill(2)), ("county", "*"), (geometry, "*")]
        ),
        ["GEO_ID"] + allcols,
    )

    # Rework columns: keep only the part of the identifier after "US" and name
    # the identifier column after the geometry level and decade.
    geoid_column = geometry.replace(" ", "").upper() + ("10" if year < 2020 else "20")
    data = data.reset_index(drop=True)
    data["GEO_ID"] = data["GEO_ID"].str.split("US").str[1]
    data = data.rename({"GEO_ID": geoid_column}, axis=1)
    data = data.rename(popcolumns, axis=1)

    # Collapse column groups: sum each group's variables into one column, then
    # discard the component variables.
    for column, group in groups.items():
        data[column] = data[group].sum(axis=1)
        data = data.drop(group, axis=1)

    # Create a POCVAP column.
    data[f"POCVAP{yearsuffix}"] = (
        data[f"VAP{yearsuffix}"] - data[f"{white}{yearsuffix}"]
    )

    return data
def _variables(prefix, start, stop, suffix="E") -> list: """ Returns the ACS variable names from the provided prefix, start, stop, and suffix parameters. Used to generate batches of names, especially for things like voting-age population. Variable names are formatted like `<prefix>_<number identifier><suffix>`, where `<prefix>` is a population grouping, `<number identifier>` is the number of the variable in that grouping, and `<suffix>` designates the file used. [Variables are listed here ](https://tinyurl.com/43ajptky>). Args: prefix (str): Population grouping; typically "B01001." These prefixes change based on subpopulation: for example, the prefix for Black age-by-sex tables is "B01001B"; for Hispanic and Latino, it is "B01001I." start (int): Where to start numbering. stop (int): Where to stop numbering. Inclusive. suffix (str): Suffix designating the file. For most purposes, this is "E." Returns: A list of ACS5 variable names. """ return [f"{prefix}_{str(t).zfill(3)}{suffix}" for t in range(start, stop + 1)] def _retrieve(year, geometry="tract"): """ Downloads and extracts compressed CVAP data for the specified year. Args: year (int): Year for which we're grabbing CVAP data. geometry (str, optional): Geometry level for which we're grabbing CVAP data. Defaults to `"tract"`. Returns: In-memory text stream of decompressed CSV data. """ # Create a mapping from geometry names to filenames. levels = {"block group": "BlockGr.csv", "tract": "Tract.csv"} inverted = {v: k for k, v in levels.items()} # Construct the URL. start, stop = year - 4, year root = "https://www2.census.gov/programs-surveys/decennial/rdo/datasets/" suffix = f"{stop}/{stop}-cvap/CVAP_{start}-{stop}_ACS_csv_files.zip" # Make the request and extract only the required files. 
with urlopen(root + suffix) as resource: with ZipFile(io.BytesIO(resource.read())) as archive: files = { inverted[file]: archive.read(file).decode(encoding="ISO-8859-1") for file in archive.namelist() if file == levels[geometry] } # Return the raw extracted file. return files[geometry] def _raw(geometry, year) -> pd.DataFrame: """ Reads raw CVAP data from the local repository. Args: geometry (str): Level of geometry for which we're getting 2019 CVAP data. year (int): Year for which data is retrieved. Returns: A DataFrame, where each block of 13 rows corresponds to an individual geometric unit (2010 Census Block Group, 2010 Census Tract) and each row in a given block corresponds to a CVAP statistic for that block's geometric unit. """ # Retrieve the data at the specified geometry level and return # it as a dataframe. return pd.read_csv(io.StringIO(_retrieve(year, geometry)), encoding="ISO-8859-1")