Add basic support for 1996 census data

finchie
Sep 7, 2023, 11:07 AM
QU7MDQED456DFUKHLHCS2SS5QSLDYFU233YNQ27KY2TOVDMTSUKAC

Dependencies

  • [2] QDEBWR2R Add missing state time series data
  • [3] 7ZN6HHL2 Extract relevant census data
  • [4] FCHAPZLD Implement simpler file merging function

Change contents

  • edit in project.py at line 34
    [3.988]
    [3.196]
    def remove_quoted_commas(line):
        """Return `line` with quote characters and quoted commas removed.

        Every '"' is dropped, and a ',' is dropped only while inside a quoted
        region, so '"$1,000",5' becomes '$1000,5'. All other characters are
        kept in their original order.

        Raises AssertionError if `line` ends while a quote is still open.
        """
        kept = []
        quoted = False
        for char in line:
            if char == '"':
                # Quotes only toggle state; they are never copied to the output
                quoted = not quoted
                continue
            if char == "," and quoted:
                # A comma inside quotes belongs to a value, it is not a separator
                continue
            kept.append(char)
        # Make sure no quotes are unclosed
        assert not quoted
        return "".join(kept)
    # Base dataset functionality, can be used to see broad historical trends from 1996-2021
    class CensusDataset:
        """Interface that each census-year dataset is expected to implement.

        Every method here raises NotImplementedError; subclasses override them.
        """

        def census_year(self):
            """Return the census year the dataset covers (subclass-defined)."""
            raise NotImplementedError()

        def median_incomes(self):
            """Return the dataset's median income figures (subclass-defined)."""
            raise NotImplementedError()

        def median_rents(self):
            """Return the dataset's median rent figures (subclass-defined)."""
            raise NotImplementedError()
    class Census1996(CensusDataset):
    def __init__(self, filename):
        """Load a 1996 census CSV export and split it into its sections.

        The file text is split on the comma-only separator row; pieces that
        are empty after stripping are dropped. The sections at index 4 and 5
        are stored as WEEKLY_RENT_HEADING and POPULATION_BY_INCOME.

        The rent and income parsers are then run once, so their assertions
        check the file layout, and the population table is pretty-printed.

        Raises OSError if the file cannot be opened, and IndexError if the
        file has fewer than six non-empty sections.
        """
        # Use a context manager so the handle is closed even if reading fails
        with open(filename, "r") as handle:
            contents = handle.read()
        # Split the file up into sections
        self.sections = []
        # Each section is separated by this string
        for section in contents.split(",,,,,,,,,,,,,"):
            section = section.strip()
            # Some lines are just section breaks, so ignore those
            if section != "":
                self.sections.append(section)
        self.WEEKLY_RENT_HEADING = self.sections[4]
        self.POPULATION_BY_INCOME = self.sections[5]
        # Run the rent and income parsers for their layout assertions; the
        # results (bracket midpoints) are not stored yet
        self.median_rents()
        self.median_incomes()
        # Extract the actual table
        population_data = self.population_data()
        from pprint import pprint

        pprint(population_data)
    def census_year(self):
        """Return the census year this dataset describes, which is always 1996."""
        year = 1996
        return year
    def filter_rent_brackets(self):
    # Remove commas inside of quotes as to not break parsing
    weekly_rent_heading = remove_quoted_commas(self.WEEKLY_RENT_HEADING)
    assert weekly_rent_heading.startswith(",,,,,, Weekly rent,,,,,,,")
    assert weekly_rent_heading.count("\n") == 2
    # The headings are split between 2 lines (using example $0-$99):
    # 1st line: start of range (eg $0-)
    # 2nd line: end of range (eg $99)
    [start_ranges, end_ranges] = weekly_rent_heading.splitlines()[1:]
    # Remove comma prefix and split into columns
    start_ranges = start_ranges[1:].split(",")
    end_ranges = end_ranges[1:].split(",")
  • edit in project.py at line 110
    [3.197]
    [3.197]
    # Make sure there is a matching number of cells
    assert len(end_ranges) == len(start_ranges)
  • edit in project.py at line 113
    [3.198]
    [3.251]
    # Remove the last 3 columns:
    # 1. $1000 or more
    # 2. Not stated
    # 3. Total
    assert start_ranges[-3:] == ["$1000", "Not", ""]
    assert end_ranges[-3:] == ["or more", "stated", "Total"]
    start_ranges = start_ranges[:-3]
    end_ranges = end_ranges[:-3]
    return (start_ranges, end_ranges)
    def parse_rent_bracket(self, start, end):
        """Convert one weekly-rent heading pair into integer bounds.

        `start` is the first heading line of a column (e.g. '$0-') and `end`
        is the second (e.g. '$99'). Returns the tuple (lower, upper) as ints;
        computing a midpoint is left to the caller.

        Raises AssertionError if a label lacks the expected '$' / '-' markers
        or should_keep rejects the combined label, and ValueError if the
        remaining text is not an integer.
        """
        assert start.startswith("$")
        assert start.endswith("-")
        assert end.startswith("$")
        # Drop the '$' prefix from both labels and the '-' suffix from the first
        lower_text, upper_text = start[1:-1], end[1:]
        # Make sure we have filtered out bad categories such as 'Not stated'
        assert should_keep(lower_text + upper_text)
        # Downcast both ends of the range
        return (int(lower_text), int(upper_text))
    def median_rents(self):
        """Return one representative weekly rent per retained rent column.

        Each value is (lower + upper + 1) // 2 of the column's bracket, i.e.
        the midpoint rounded up to a whole dollar (50 for $0-$99).
        """
        lower_labels, upper_labels = self.filter_rent_brackets()
        midpoints = []
        for index, lower_label in enumerate(lower_labels):
            # The two heading rows are parallel: same index, same column
            upper_label = upper_labels[index]
            lower, upper = self.parse_rent_bracket(lower_label, upper_label)
            midpoints.append((lower + upper + 1) // 2)
        return midpoints
    def filter_income_brackets(self):
        """Return the income rows that describe a closed dollar range.

        POPULATION_BY_INCOME is split into lines; the first two rows and the
        last three rows are checked against their expected labels and then
        discarded, leaving only the rows in between.

        Raises AssertionError if any of those five rows has an unexpected
        label.
        """
        rows = self.POPULATION_BY_INCOME.splitlines()
        # Rows containing irrelevant data, keyed by their position in the table
        discarded = (
            (0, "Negative income"),
            (1, "Nil income"),
            (-3, '"$2,000 or more"'),
            (-2, "Partial income stated(a)"),
            (-1, "All incomes not stated(b)"),
        )
        for position, label in discarded:
            assert rows[position].startswith(label)
        return rows[2:-3]
    def extract_income_columns(self, line):
        """Split one table row into its cells.

        Quoted commas are removed first (so '"$1,000"' becomes '$1000') and
        the cleaned row is then split on the remaining commas. Returns the
        list of cell strings.
        """
        cleaned = remove_quoted_commas(line)
        return cleaned.split(",")
    def median_incomes(self):
        """Return one representative income per retained income bracket.

        The first cell of every retained row is a label such as '$1-$39';
        each value is (low + high + 1) // 2, i.e. the midpoint of the bracket
        rounded up to a whole dollar.

        Raises AssertionError if a label is not two '$' amounts or should_keep
        rejects it, and ValueError if the label does not split into exactly
        two parts or an amount is not an integer.
        """
        midpoints = []
        for row in self.filter_income_brackets():
            bracket = self.extract_income_columns(row)[0]
            # Two dollar amounts separated by a hyphen ('-'), e.g. '$1-$39'
            low_label, high_label = bracket.split("-")
            # Make sure we've got data in the right shape
            assert low_label.startswith("$")
            assert high_label.startswith("$")
            assert should_keep(bracket)
            # Strip the '$' prefixes and downcast
            low = int(low_label[1:])
            high = int(high_label[1:])
            midpoints.append((low + high + 1) // 2)
        return midpoints
    def population_data(self):
        """Return the population counts as a list of rows of ints.

        One inner list is produced per retained income row, holding every
        cell after the first (the income bracket label) converted to int.

        Raises ValueError if a cell is not an integer.
        """
        return [
            # We don't care about income brackets, only population data
            [int(cell) for cell in self.extract_income_columns(row)[1:]]
            for row in self.filter_income_brackets()
        ]
  • replacement in project.py at line 324
    [3.4189][3.4189:4269]()
    bad_words = ["Neg_Ni_inc", "4000more", "R650more", "R_NS", "R_Tot", "PI_S"]
    [3.4189]
    [3.4269]
    # Substrings marking a heading that should be discarded. The last two
    # entries cover the 1996 labels ("$1000 or more", and "Not" + "stated"
    # once the two heading lines are joined).
    # NOTE(review): presumably consumed by should_keep — confirm against the
    # enclosing function, which is not visible in this hunk.
    bad_words = [
    "Neg_Ni_inc",
    "4000more",
    "R650more",
    "R_NS",
    "R_Tot",
    "PI_S",
    "or more",
    "Notstated",
    ]
  • replacement in project.py at line 447
    [3.1010][2.430:498]()
    parts = merge_time_series_files(STATE_TIME_SERIES)
    print(parts[-1])
    [3.1010]
    [3.7984]
    # Time series merging is switched off while the 1996 parser is developed:
    # parts = merge_time_series_files(STATE_TIME_SERIES)
    # Constructing Census1996 parses the file and pretty-prints its population table
    data = Census1996("Data/1996/1996_income_by_rent.csv")