import io
import json
from datetime import datetime
from typing import List, Tuple, Union
import pandas as pd
import requests
from pydantic import BaseModel
from .URLs import csvs, ids, one
[docs]
class Submission(BaseModel):
"""
Provides a base model for data retrieved from districtr. Allows us to use
dot notation when accessing properties rather than dict notation.
"""
link: str
"""A districtr URL."""
plan: dict
"""districtr plan object."""
id: str
"""districtr identifier."""
units: str
"""Unit identifier (e.g. `GEOID`)."""
unitsType: str
"""Unit type (e.g. `blocks20`, `blockgroup`, etc.)"""
tileset: str
"""Mapbox tileset URL."""
type: str
"""Not sure."""
[docs]
def tabularized(state, submissions) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""
Returns districtr submission information in a tabular format.
Args:
state (State): `us.State` object (e.g. `us.states.WI`).
submissions (list): List of `Submission` objects returned from a
call to `submissions`.
Returns:
Three dataframes corresponding to plan-based submissions, COI-based
submissions, and written submissions to the provided state.
Example:
Prototypical example usage.
import us
from gerrytools.retrieve import submissions, tabularized
# Set the state.
state = us.states.WI
# Retrieve the raw districtr submissions, then tabularize them.
subs = submissions(state)
plans, cois, written = tabularized(state, subs)
"""
# Sort submissions. (Not sure why this is necessary? Holdover from previous
# fetching code.)
submissions = list(sorted(submissions, key=lambda s: s.id))
# Categorize into three categories: plan submissions, COI submissions, and
# written submissions (which are ignored as they don't appear in the list
# of submissions).
_plans = [s.dict() for s in submissions if s.type == "plan"]
_cois = [s.dict() for s in submissions if s.type == "coi"]
# Create preliminary dataframes so we can do safe `merge`s rather than rely
# explicitly on sorting; this also allows us to specify a sample size if
# we're only looking to sample a specific number of plans.
subset_plans = pd.DataFrame.from_records(_plans)
subset_cois = pd.DataFrame.from_records(_cois)
# Get appropriate URLs and create dataframes.
plans_url = csvs(state)
cois_url = csvs(state, ptype="coi")
written_url = csvs(state, ptype="written")
plans = as_dataframe(plans_url)
cois = as_dataframe(cois_url)
writtens = as_dataframe(written_url)
# Adjust column contents for the plan and COI dataframes.
for universe in [plans, cois]:
# Adjust the `link` column type and create an `id` column from it.
universe["link"] = universe["link"].astype(str)
universe["id"] = parse_id(universe["link"])
# Adjust column contents for all dataframes.
for df in [plans, cois, writtens]:
df["datetime"] = parse_datetime(df["datetime"])
# Add the retrieved plan data to the dataframes *if the subset dataframes
# contain items*.
if not subset_plans.empty:
plans = plans.merge(subset_plans, on="id")
else:
plans = pd.DataFrame()
if not subset_cois.empty:
cois = cois.merge(subset_cois, on="id")
else:
cois = pd.DataFrame()
# Drop bad columns and rename. Not sure why we have to `inplace`
# things here, but... fine.
for df in [plans, cois]:
if not df.empty:
# Remove columns we don't necessarily care about.
for col in ["type_x", "link_x", "coalition"]:
if col in list(df):
df.drop(col, axis=1, inplace=True)
# Rename the columns we do care about.
df.rename({"type_y": "type", "link_y": "link"}, axis=1, inplace=True)
return plans, cois, writtens
[docs]
def submissions(state, sample=None) -> List[Submission]:
"""
Retrieves raw districtr objects; this includes both plan- and COI-based
submissions.
Args:
state (State): `us.State` object (e.g. `us.states.WI`).
sample (int, optional): The number of sample plans to retrieve.
Returns:
A list of `Submissions`, either to be interpreted raw or tabularized.
"""
# Get the appropriate URL and send the request. Made some basic ASCII
# art with the second three variable names... it's like the request
# is loading letter by letter.
url = ids(state)
__w = requests.get(url).text
_aw = json.loads(__w)["ids"]
raw = _aw[:sample] if sample else _aw
# Create `Submission` objects for each of the retrieved objects.
# Getting the individual plans is the bottleneck here, and
# unfortunately we can't retrieve them in bulk (... or can we?).
submissions = []
for entity in raw:
# Retrieve the required data points.
identifier = parse_id(entity["link"], df=False)
districtr = individual(identifier)
# Force all plan keys and values to strings.
try:
plan = {
str(k): str(v) if not isinstance(v, list) else str(v[0])
for k, v in districtr["plan"]["assignment"].items()
}
units = districtr["plan"]["units"]["name"]
unitsType = districtr["plan"]["units"]["unitType"]
tileset = districtr["plan"]["units"]["tilesets"][0]["sourceLayer"]
# Create a new Submission.
submissions.append(
Submission(
link=entity["link"],
id=identifier,
plan=plan,
units=units,
unitsType=unitsType,
tileset=tileset,
type=entity["type"],
)
)
except BaseException:
pass
return submissions
def as_dataframe(url) -> pd.DataFrame:
"""
Retrieves encoded submission data from the provided URL and parses it into
a pandas `DataFrame`.
Args:
url (str): Wherever we're getting things from.
"""
raw = requests.get(url).content
return pd.read_csv(io.StringIO(raw.decode("utf-8")), parse_dates=True)
def individual(identifier) -> dict:
"""
Retrieves districtr data for an individual plan.
Args:
identifier (str): districtr identifier for an individual plan.
Returns:
districtr plan object (as a dictionary).
"""
raw = requests.get(one(identifier))
return json.loads(raw.text)
def parse_id(link, df=True) -> Union[str, pd.Series]:
"""
Given a districtr link, parse out the districtr identifier.
Args:
l (str): districtr url containing the districtr ID of the provided
plan.
df (bool, optional): If `l` is a dataframe, then we use pandas string
operations rather than built-in ones.
Returns:
districtr ID.
"""
if df:
return link.str.split("/").str[-1].str.split("?").str[0]
return link.split("/")[-1].split("?")[0]
def parse_datetime(d) -> pd.Series:
"""
Parses the timestamps in the dataframe returned by `as_dataframe()`.
Args:
d (str): Column of the dataframe containing timestamps.
Returns:
`d` with its datetimes parsed correctly.
"""
# Parse datetimes.
prefix = d.str.split("+").str[0]
suffix = d.str.split("+").str[1].str.split(" ").str[0]
dt = prefix + " +" + suffix
# Convert datetimes.
return dt.apply(lambda r: datetime.strptime(r, "%a %b %d %Y %X %Z %z"))