finchie/census - Change QU7MDQED456DFUKHLHCS2SS5QSLDYFU233YNQ27KY2TOVDMTSUKAC

Add basic support for 1996 census data

This is the first dataset that uses classes; all the others will probably migrate over.

Created by finchie on September 7, 2023

QU7MDQED456DFUKHLHCS2SS5QSLDYFU233YNQ27KY2TOVDMTSUKAC

Dependencies

In channels

main

Change contents

Insertion in project.py at line 34 [4.35]

[3.988]

[4.196]


def remove_quoted_commas(line):
    """Strip double quotes from *line* and drop any commas found inside them.

    Commas outside of a quoted region are kept, so the result can be split on
    "," without quoted values such as '"$1,000"' being broken apart.

    Raises:
        AssertionError: If *line* contains an unclosed double quote.
    """
    kept = []
    in_quotes = False
    for char in line:
        if char == '"':
            # Quote characters only toggle state; they are never copied over
            in_quotes = not in_quotes
            continue
        if char == "," and in_quotes:
            continue
        kept.append(char)
    # Make sure no quotes are unclosed
    assert not in_quotes
    return "".join(kept)
# Base dataset functionality, can be used to see broad historical trends from 1996-2021
class CensusDataset:
    """Common interface for census datasets.

    Every method here raises NotImplementedError; a concrete dataset class
    overrides the ones it supports.
    """

    def census_year(self):
        """Return the year of the census (must be overridden)."""
        raise NotImplementedError()

    def median_incomes(self):
        """Return the median incomes of the dataset (must be overridden)."""
        raise NotImplementedError()

    def median_rents(self):
        """Return the median rents of the dataset (must be overridden)."""
        raise NotImplementedError()
class Census1996(CensusDataset):
    def __init__(self, filename):
        """Read a 1996 census CSV export and parse its income-by-rent table.

        The file is split into sections, the weekly-rent heading and the
        population-by-income table are stored on the instance, and the parsed
        population table is pretty-printed.

        Args:
            filename: Path of the CSV file to read.

        Raises:
            OSError: If the file cannot be opened or read.
            IndexError: If the file has fewer than six non-empty sections.
            AssertionError: If a section does not have the expected layout.
            ValueError: If a cell that should hold a number cannot be parsed.
        """
        from pprint import pprint

        # Read through a context manager so the file handle is always closed
        with open(filename, "r") as census_file:
            contents = census_file.read()
        # Split the file up into sections
        self.sections = []
        # Each section is separated by this string
        for section in contents.split(",,,,,,,,,,,,,"):
            section = section.strip()
            # Some pieces are just section breaks, so ignore those
            if section != "":
                self.sections.append(section)
        self.WEEKLY_RENT_HEADING = self.sections[4]
        self.POPULATION_BY_INCOME = self.sections[5]
        # The results are not used yet; the calls are kept because parsing
        # the rent columns and income brackets checks the file layout up front
        self.median_rents()
        self.median_incomes()
        # Extract the actual table
        population_data = self.population_data()
        pprint(population_data)
    def census_year(self):
        """Return the year this dataset describes."""
        year = 1996
        return year
    def filter_rent_brackets(self):
        """Split the weekly-rent heading into per-column range labels.

        Returns:
            A ``(start_ranges, end_ranges)`` tuple of equally long lists of
            heading cells, without the final three columns
            ("$1000 or more", "Not stated" and "Total").

        Raises:
            AssertionError: If the heading does not have the expected layout.
        """
        # Remove commas inside of quotes so they do not break the column split
        weekly_rent_heading = remove_quoted_commas(self.WEEKLY_RENT_HEADING)
        # The section must start with the title row and span exactly 3 lines
        assert weekly_rent_heading.startswith(",,,,,,     Weekly rent,,,,,,,")
        assert weekly_rent_heading.count("\n") == 2
        # The headings are split between 2 lines (using example $0-$99):
        # 1st line: start of range (eg $0-)
        # 2nd line: end of range (eg $99)
        # The title row (line 0) is skipped
        [start_ranges, end_ranges] = weekly_rent_heading.splitlines()[1:]
        # Drop the first character of each line (the comma prefix) and split into columns
        start_ranges = start_ranges[1:].split(",")
        end_ranges = end_ranges[1:].split(",")

Insertion in project.py at line 110 [4.35]

[4.197]

        # Both heading lines must have the same number of cells, since the
        # start and end of each range are paired up by column index
        assert len(end_ranges) == len(start_ranges)

Insertion in project.py at line 113 [4.35]

[4.198]

[4.251]

        # Remove the last 3 columns, which are not "$start-$end" ranges:
        # 1. $1000 or more
        # 2. Not stated
        # 3. Total
        # Check they are exactly the columns we expect before dropping them
        assert start_ranges[-3:] == ["$1000", "Not", ""]
        assert end_ranges[-3:] == ["or more", "stated", "Total"]
        start_ranges = start_ranges[:-3]
        end_ranges = end_ranges[:-3]
        return (start_ranges, end_ranges)
    def parse_rent_bracket(self, start, end):
        """Convert one pair of rent heading cells into integer bounds.

        Args:
            start: Lower-bound cell, shaped like "$0-".
            end: Upper-bound cell, shaped like "$99".

        Returns:
            A ``(start, end)`` tuple of ints. The midpoint is left to the caller.

        Raises:
            AssertionError: If a cell is not shaped as described above or the
                bracket is rejected by ``should_keep``.
            ValueError: If the text between the markers is not an integer.
        """
        assert start.startswith("$")
        assert start.endswith("-")
        assert end.startswith("$")
        # Lower bound: drop the '$' prefix and the '-' suffix
        lower_text = start[1:-1]
        # Upper bound: drop the '$' prefix
        upper_text = end[1:]
        # Make sure we have filtered out bad categories such as 'Not stated'
        assert should_keep(lower_text + upper_text)
        lower = int(lower_text)
        upper = int(upper_text)
        return (lower, upper)
    def median_rents(self):
        """Return the midpoint of every weekly-rent bracket, in column order.

        The midpoint is rounded up to a whole number, eg 50 for $0-$99.
        """
        (start_ranges, end_ranges) = self.filter_rent_brackets()
        midpoints = []
        for column, start_text in enumerate(start_ranges):
            # The end of the range lives in the same column of the second heading line
            lower, upper = self.parse_rent_bracket(start_text, end_ranges[column])
            midpoints.append((lower + upper + 1) // 2)
        return midpoints
    def filter_income_brackets(self):
        rows = self.POPULATION_BY_INCOME.splitlines()
        # Remove rows containing irrelevant data
        assert rows[0].startswith("Negative income")
        assert rows[1].startswith("Nil income")
        assert rows[-3].startswith('"$2,000 or more"')
        assert rows[-2].startswith("Partial income stated(a)")
        assert rows[-1].startswith("All incomes not stated(b)")
        rows = rows[2:-3]
        return rows
    def extract_income_columns(self, line):
        """Split one table row into its cells.

        Quotes and the commas inside them are removed first
        (eg '"$1,000"' -> '$1000'), so quoted values stay in one cell.
        """
        return remove_quoted_commas(line).split(",")
    def median_incomes(self):
        """Return the midpoint of every bounded income bracket, in row order.

        The midpoint is rounded up to a whole number, eg 20 for $1-$39.
        """
        midpoints = []
        for row in self.filter_income_brackets():
            # The first cell holds the bracket label, eg '$1-$39'
            bracket = self.extract_income_columns(row)[0]
            # Two dollar amounts separated by a hyphen ('-')
            low_text, high_text = bracket.split("-")
            # Make sure we've got data in the right shape
            assert low_text.startswith("$")
            assert high_text.startswith("$")
            assert should_keep(bracket)
            # Drop the '$' prefixes and convert to integers
            low = int(low_text[1:])
            high = int(high_text[1:])
            midpoints.append((low + high + 1) // 2)
        return midpoints
    def population_data(self):
        """Return the population counts as a list of rows of ints.

        There is one inner list per bounded income bracket; the bracket label
        in the first cell of each row is skipped.
        """
        return [
            [int(cell) for cell in self.extract_income_columns(row)[1:]]
            for row in self.filter_income_brackets()
        ]

Replacement in project.py at line 324 [4.35]

B:BD[4.4189] → [4.4189:4269]

    bad_words = ["Neg_Ni_inc", "4000more", "R650more", "R_NS", "R_Tot", "PI_S"]

[4.4189]

[4.4269]

    bad_words = [
        "Neg_Ni_inc",
        "4000more",
        "R650more",
        "R_NS",
        "R_Tot",
        "PI_S",
        "or more",
        "Notstated",
    ]

Replacement in project.py at line 447 [4.35]

B:BD[3.1010] → [2.430:498]

parts = merge_time_series_files(STATE_TIME_SERIES)
print(parts[-1])

[3.1010]

[4.7984]

# NOTE(review): the time-series merge is commented out here, presumably while
# the class-based datasets are being introduced; confirm before re-enabling
# parts = merge_time_series_files(STATE_TIME_SERIES)
# Constructing the dataset reads and parses the file, then prints the population table
data = Census1996("Data/1996/1996_income_by_rent.csv")