import io
from urllib.request import urlopen
from zipfile import ZipFile
import censusdata
import pandas as pd
[docs]
def cvap(state, geometry="tract", year=2020) -> pd.DataFrame:
"""
Retrieves and CSV-formats 5-year CVAP data for the provided state at
the specified geometry level. Geometries from the **2010 Census**.
Variables and descriptions are [listed here](https://tinyurl.com/3mnrm56s).
Args:
state (us.State): The `State` object for which we're retrieving 2019
ACS CVAP Special Tab.
geometry (str, optional): Level of geometry for which we're getting
data. Accepted values are `"block group"` for 2010 Census Block
Groups, and `"tract"` for 2010 Census Tracts. Defaults to `"tract"`.
year (int, optional): Year for which data is retrieved. Defaults to
2020.
Returns
A `DataFrame` with a `GEOID` column and corresponding CVAP columns from
the ACS CVAP Special Tab for the specified year.
"""
# Maps line numbers to descriptors.
descriptions = {
1: "CVAP",
2: "NHCVAP",
3: "NHAMINCVAP",
4: "NHASIANCVAP",
5: "NHBLACKCVAP",
6: "NHNHPICVAP",
7: "NHWHITECVAP",
8: "NHWHITEAMINCVAP",
9: "NHWHITEASIANCVAP",
10: "NHWHITEBLACKCVAP",
11: "NHBLACKAMINCVAP",
12: "NHOTHCVAP",
13: "HCVAP",
}
# First, load the raw data requested; allowed geometry values are
# "block group" and "tract."
if geometry not in {"block group", "tract"}:
print(f'Requested geometry "{geometry}" is not allowed; ' "loading tracts.")
geometry = "tract"
abbrv = geometry if geometry == "tract" else "block group"
# Load the raw data.
raw = _raw(abbrv, year)
# Create a STATE column for filtering and remove all rows which don't match
# the state FIPS code.
raw["GEOID"] = raw["geoid"].str.split("US").str[1]
raw["STATE"] = raw["GEOID"].str[:2]
instate = raw[raw["STATE"] == str(state.fips)]
# Now that we have the in-state data, we aim to pivot the table. Because
# the ACS data is in a line-numbered format (i.e. each chunk of 13 lines
# matches to an individual geometry, and each of the 13 lines describes
# an individual statistic) we need to first collapse each chunk of 13
# lines, then build a dataframe from the resulting collapsed lines.
# First we send the dataframe to a list of records.
instate_records = instate.to_dict(orient="records")
collapsed = []
# Get year stuff.
decade = "10" if year < 2020 else "20"
yearsuffix = str(year)[2:]
# Next, we collapse these records to a single record.
for i in range(0, len(instate_records), 13):
# Create an empty records.
record = {}
# For each of the records in the block, "collapse" them into a single
# record.
block = instate_records[i : i + 13]
for line in block:
record[geometry.replace(" ", "").upper() + decade] = line["GEOID"]
record[descriptions[line["lnnumber"]] + yearsuffix] = line["cvap_est"]
record[descriptions[line["lnnumber"]] + f"{yearsuffix}e"] = line["cvap_moe"]
collapsed.append(record)
# Create a dataframe and a POCCVAP column; all people minus non-Hispanic
# White.
data = pd.DataFrame().from_records(collapsed)
data[f"POCCVAP{yearsuffix}"] = (
data[f"CVAP{yearsuffix}"] - data[f"NHWHITECVAP{yearsuffix}"]
)
return data
[docs]
def acs5(
state, geometry="tract", year=2020, columns=[], white="NHWHITEVAP"
) -> pd.DataFrame:
"""
Retrieves ACS 5-year population estimates for the provided state, geometry
level, and year. Also retrieves ACS-reported CVAP data, which closely
matches that reported by the CVAP special tabulation; CVAP data are only
returned at the tract level, and are otherwise reported as 0.
Args:
state (us.State): `State` object for the desired state.
geometry (str, optional): Geometry level at which data is retrieved.
Acceptable values are `"tract"` and `"block group"`. Defaults to
`"tract"`, so data is retrieved at the 2020 Census tract level.
year (int, optional): Year for which data is retrieved. Defaults to
2020.
columns (list, optional): Columns to retrieve. If `None`, a default set
of columns including total populations by race and ethnicity and
voting-age populations by race and ethnicity are returned, along
with a GEOID column.
white (str, optional): The column removed from totals when calculating
POC populations.
Returns:
A DataFrame containing the formatted data.
"""
# Columns for total populations.
yearsuffix = str(year)[-2:]
popcolumns = {
"B01001_001E": "TOTPOP" + yearsuffix,
"B03002_003E": "WHITE" + yearsuffix,
"B03002_004E": "BLACK" + yearsuffix,
"B03002_005E": "AMIN" + yearsuffix,
"B03002_006E": "ASIAN" + yearsuffix,
"B03002_007E": "NHPI" + yearsuffix,
"B03002_008E": "OTH" + yearsuffix,
"B03002_009E": "2MORE" + yearsuffix,
"B03002_002E": "NHISP" + yearsuffix,
}
# Create a dictionary of column groups.
groups = {}
# Get VAP columns. The columns listed here are by race, irrespective of
# ethnicity; for example, WVAP19 is the group of people who identified
# White as their *only* race, including people who identified as
# Hispanic and White.
vapnames = [
"WHITEVAP",
"BLACKVAP",
"AMINVAP",
"ASIANVAP",
"NHPIVAP",
"OTHVAP",
"2MOREVAP",
"NHWHITEVAP",
"HVAP",
]
vaptables = list(
zip(
[column + yearsuffix for column in vapnames],
["A", "B", "C", "D", "E", "F", "G", "H", "I"],
)
)
groups.update(
{
column: _variables(f"B01001{table}", 7, 16)
+ _variables(f"B01001{table}", 22, 31)
for column, table in vaptables
}
)
# Get CVAP columns; the same goes for these columns as does the above,
# except these columns are 18 years and older *and* citizens.
cvapnames = [
"WHITECVAP",
"BLACKCVAP",
"AMINCVAP",
"ASIANCVAP",
"NHPICVAP",
"OTHCVAP",
"2MORECVAP",
"NHWHITECVAP",
"HCVAP",
]
cvaptables = list(
zip(
[name + yearsuffix for name in cvapnames],
["A", "B", "C", "D", "E", "F", "G", "H", "I"],
)
)
groups.update(
{
column: _variables(f"B05003{table}", 9, 9)
+ _variables(f"B05003{table}", 11, 11)
+ _variables(f"B05003{table}", 20, 20) # men
+ _variables(f"B05003{table}", 22, 22) # women
for column, table in cvaptables
}
)
# Get all voting-age people and citizen voting-age people.
groups["VAP" + yearsuffix] = _variables("B01001", 7, 25) + _variables(
"B01001", 31, 49
)
groups["CVAP" + yearsuffix] = (
_variables("B05003", 9, 9)
+ _variables("B05003", 11, 11)
+ _variables("B05003", 20, 20)
+ _variables("B05003", 22, 22)
)
# TODO: all variables used across the data submodule should be packaged up
# as a class, so we can access individual dictionaries of variables to add.
# For example, we should have a `Variables.acs5.vap` property which gives
# us the voting-age population variables for the ACS 5-year estimates.
# Get the list of all columns.
allcols = (
list(popcolumns.keys()) + [c for k in groups.values() for c in k] + columns
)
# Retrieve the data from the Census API.
data = censusdata.download(
"acs5",
year,
censusdata.censusgeo(
[("state", str(state.fips).zfill(2)), ("county", "*"), (geometry, "*")]
),
["GEO_ID"] + allcols,
)
# Rework columns.
data = data.reset_index(drop=True)
data["GEO_ID"] = data["GEO_ID"].str.split("US").str[1]
data = data.rename(
{"GEO_ID": geometry.replace(" ", "").upper() + ("10" if year < 2020 else "20")},
axis=1,
)
data = data.rename(popcolumns, axis=1)
# Collapse column groups.
for column, group in groups.items():
data[column] = data[group].sum(axis=1)
data = data.drop(group, axis=1)
# Create a POCVAP column.
data[f"POCVAP{yearsuffix}"] = (
data[f"VAP{yearsuffix}"] - data[f"{white}{yearsuffix}"]
)
return data
def _variables(prefix, start, stop, suffix="E") -> list:
"""
Returns the ACS variable names from the provided prefix, start, stop, and
suffix parameters. Used to generate batches of names, especially for things
like voting-age population. Variable names are formatted like
`<prefix>_<number identifier><suffix>`, where `<prefix>` is a
population grouping,
`<number identifier>` is the number of the variable in that grouping, and
`<suffix>` designates the file used. [Variables are listed
here ](https://tinyurl.com/43ajptky>).
Args:
prefix (str): Population grouping; typically "B01001." These prefixes
change based on subpopulation: for example, the prefix for Black
age-by-sex tables is "B01001B"; for Hispanic and Latino, it is
"B01001I."
start (int): Where to start numbering.
stop (int): Where to stop numbering. Inclusive.
suffix (str): Suffix designating the file. For most purposes, this is "E."
Returns:
A list of ACS5 variable names.
"""
return [f"{prefix}_{str(t).zfill(3)}{suffix}" for t in range(start, stop + 1)]
def _retrieve(year, geometry="tract"):
"""
Downloads and extracts compressed CVAP data for the specified year.
Args:
year (int): Year for which we're grabbing CVAP data.
geometry (str, optional): Geometry level for which we're grabbing CVAP
data. Defaults to `"tract"`.
Returns:
In-memory text stream of decompressed CSV data.
"""
# Create a mapping from geometry names to filenames.
levels = {"block group": "BlockGr.csv", "tract": "Tract.csv"}
inverted = {v: k for k, v in levels.items()}
# Construct the URL.
start, stop = year - 4, year
root = "https://www2.census.gov/programs-surveys/decennial/rdo/datasets/"
suffix = f"{stop}/{stop}-cvap/CVAP_{start}-{stop}_ACS_csv_files.zip"
# Make the request and extract only the required files.
with urlopen(root + suffix) as resource:
with ZipFile(io.BytesIO(resource.read())) as archive:
files = {
inverted[file]: archive.read(file).decode(encoding="ISO-8859-1")
for file in archive.namelist()
if file == levels[geometry]
}
# Return the raw extracted file.
return files[geometry]
def _raw(geometry, year) -> pd.DataFrame:
"""
Reads raw CVAP data from the local repository.
Args:
geometry (str): Level of geometry for which we're getting 2019 CVAP
data.
year (int): Year for which data is retrieved.
Returns:
A DataFrame, where each block of 13 rows corresponds to an individual
geometric unit (2010 Census Block Group, 2010 Census Tract) and
each row in a given block corresponds to a CVAP statistic for that
block's geometric unit.
"""
# Retrieve the data at the specified geometry level and return
# it as a dataframe.
return pd.read_csv(io.StringIO(_retrieve(year, geometry)), encoding="ISO-8859-1")