Add support for 2001 census

finchie
Sep 7, 2023, 1:47 PM
2AE4VE7UAQOEXBTOWOSXSEC2Q5PII4PNZH6MZFCLGNY3LGJOGKIAC

Dependencies

  • [2] QU7MDQED Add basic support for 1996 census data
  • [3] FCHAPZLD Implement simpler file merging function
  • [4] 7ZN6HHL2 Extract relevant census data
  • [*] QDEBWR2R Add missing state time series data

Change contents

  • edit in project.py at line 1
    [3.35]
    [6.0]
    DATA_DIRECTORY = "Data"
  • edit in project.py at line 53
    [2.436]
    [2.436]
    def extract_sections(relative_path, separator):
        """Read a data file and split it into its non-empty sections.

        Args:
            relative_path: Path of the file, relative to DATA_DIRECTORY.
            separator: String that separates one section from the next.

        Returns:
            A list of sections in file order, each stripped of surrounding
            whitespace. Pieces that are empty after stripping are left out.
        """
        # A context manager closes the handle as soon as the text is read,
        # instead of leaving it open until garbage collection.
        with open(f"{DATA_DIRECTORY}/{relative_path}", "r") as file:
            contents = file.read()
        # Each section is separated by `separator`
        stripped = (section.strip() for section in contents.split(separator))
        # Some pieces are just section breaks, so ignore those
        return [section for section in stripped if section != ""]
    # TODO: use this everywhere
    def tabulate(text):
        """Split comma-separated text into a table of cells.

        Args:
            text: Text with one row per line and commas between columns.

        Returns:
            A list of rows, where each row is a list of the cell strings on
            that line with any extraneous whitespace stripped. An empty line
            yields a row holding a single empty string.
        """
        return [
            [cell.strip() for cell in line.split(",")]
            for line in text.splitlines()
        ]
    def parse_financial_bracket(start, end):
    assert start.startswith("$")
    assert end.startswith("$")
    # Strip '$' prefix
    start = start[1:]
    end = end[1:]
    # Make sure we have filtered out bad categories such as 'Not stated'
    assert should_keep(start + end)
    # Downcast ranges from str -> int
    start = int(start)
    end = int(end)
  • edit in project.py at line 99
    [2.437]
    [2.437]
    return (start, end)
  • edit in project.py at line 101
    [2.438]
    [2.438]
  • replacement in project.py at line 107
    [2.609][2.609:639]()
    def median_incomes(self):
    [2.609]
    [2.639]
    def median_rents(self):
  • replacement in project.py at line 110
    [2.674][2.674:702]()
    def median_rents(self):
    [2.674]
    [2.702]
    def median_incomes(self):
  • replacement in project.py at line 115
    [2.771][2.771:1200]()
    def __init__(self, filename):
    file = open(filename, "r").read()
    # Split the file up into sections
    self.sections = []
    # Each section is separated by this string
    for section in file.split(",,,,,,,,,,,,,"):
    section = section.strip()
    # Some lines are just section breaks, so ignore those
    if section != "":
    self.sections.append(section)
    [2.771]
    [2.1200]
    def __init__(self):
    sections = extract_sections("1996/1996_income_by_rent.csv", ",,,,,,,,,,,,,")
  • replacement in project.py at line 118
    [2.1201][2.1201:1306]()
    self.WEEKLY_RENT_HEADING = self.sections[4]
    self.POPULATION_BY_INCOME = self.sections[5]
    [2.1201]
    [2.1306]
    self.WEEKLY_RENT_HEADING = remove_quoted_commas(sections[4])
    self.POPULATION_BY_INCOME = remove_quoted_commas(sections[5])
  • replacement in project.py at line 136
    [2.1802][2.1802:2075]()
    # Remove commas inside of quotes as to not break parsing
    weekly_rent_heading = remove_quoted_commas(self.WEEKLY_RENT_HEADING)
    assert weekly_rent_heading.startswith(",,,,,, Weekly rent,,,,,,,")
    assert weekly_rent_heading.count("\n") == 2
    [2.1802]
    [2.2075]
    assert self.WEEKLY_RENT_HEADING.startswith(",,,,,, Weekly rent,,,,,,,")
    assert self.WEEKLY_RENT_HEADING.count("\n") == 2
  • replacement in project.py at line 142
    [2.2235][2.2235:2460]()
    [start_ranges, end_ranges] = weekly_rent_heading.splitlines()[1:]
    # Remove comma prefix and split into columns
    start_ranges = start_ranges[1:].split(",")
    end_ranges = end_ranges[1:].split(",")
    [2.2235]
    [3.196]
    [start_ranges, end_ranges] = tabulate(self.WEEKLY_RENT_HEADING)[1:]
    # Drop the empty leading column left by the comma prefix
    start_ranges = start_ranges[1:]
    end_ranges = end_ranges[1:]
  • edit in project.py at line 162
    [2.2922][2.2922:3443]()
    def parse_rent_bracket(self, start, end):
    assert start.startswith("$")
    assert start.endswith("-")
    assert end.startswith("$")
    # Strip '$' prefix and '-' suffix
    start = start[1:-1]
    # Strip '$' prefix
    end = end[1:]
    # Make sure we have filtered out bad categories such as 'Not stated'
    assert should_keep(start + end)
    # Downcast ranges and calculate midpoint
    start = int(start)
    end = int(end)
    return (start, end)
  • replacement in project.py at line 170
    [2.3761][2.3761:3824]()
    (start, end) = self.parse_rent_bracket(start, end)
    [2.3761]
    [2.3824]
    # Strip hyphen suffix from `start`
    start = start[:-1]
    (start, end) = parse_financial_bracket(start, end)
  • edit in project.py at line 191
    [2.4369][2.4369:4577]()
    def extract_income_columns(self, line):
    # Convert '"$1,000"' -> '$1000'
    line = remove_quoted_commas(line)
    # Split into columns
    line = line.split(",")
    return line
  • replacement in project.py at line 195
    [2.4630][2.4630:4734]()
    for line in self.filter_income_brackets():
    line = self.extract_income_columns(line)
    [2.4630]
    [2.4734]
    for row in self.filter_income_brackets():
  • replacement in project.py at line 198
    [2.4849][2.4849:4895]()
    [start, end] = line[0].split("-")
    [2.4849]
    [2.4895]
    [start, end] = row[0].split("-")
  • replacement in project.py at line 203
    [2.5034][2.5034:5074]()
    assert should_keep(line[0])
    [2.5034]
    [2.5074]
    assert should_keep(row[0])
  • edit in project.py at line 230
    [2.5782]
    [2.5782]
    class Census2001(CensusDataset):
    def __init__(self):
        """Load the 2001 income and rent tables.

        Each CSV is split into sections with `extract_sections`; the section
        at index 5 of each file is kept, after `remove_quoted_commas` has
        been applied to it.
        """
        # The income file is split on ",,,"
        income_parts = extract_sections("2001/Income_2001.csv", ",,,")
        income_table = income_parts[5]
        self.INCOME_DATA = remove_quoted_commas(income_table)

        # The rent file is split on ",,,,,"
        rent_parts = extract_sections("2001/Rent_2001.csv", ",,,,,")
        rent_table = rent_parts[5]
        self.RENT_DATA = remove_quoted_commas(rent_table)
    def census_year(self):
        """Return the year of this census dataset, which is always 2001."""
        year = 2001
        return year
    def filtered_rent_brackets(self):
        """Return the rent bracket labels, minus the two trailing rows.

        Returns:
            The first column of every row of the rent table except the last
            two, in table order.

        Raises:
            AssertionError: If the last two rows are not labelled
                "$500 or more" and "Not stated".
        """
        table = tabulate(self.RENT_DATA)
        # Confirm the layout before discarding the two trailing rows, which
        # are not '$start-$end' ranges
        assert table[-2][0] == "$500 or more"
        assert table[-1][0] == "Not stated"
        return [row[0] for row in table[:-2]]
    def median_rents(self):
        """Return the midpoint of each filtered rent bracket.

        Returns:
            A list of ints, one per bracket from `filtered_rent_brackets`,
            in the same order. Each value is the midpoint of the bracket's
            bounds, rounded up to a whole dollar.
        """
        midpoints = []
        for bracket in self.filtered_rent_brackets():
            # Rent brackets are 2 dollar amounts separated by a hyphen ('-')
            # Example rent bracket: $1-$49
            start, end = bracket.split("-")
            start, end = parse_financial_bracket(start, end)
            # Adding 1 before the floor division rounds a .5 midpoint up
            midpoints.append((start + end + 1) // 2)
        return midpoints
  • edit in project.py at line 268
    [2.5783]
    [2.5783]
    def filtered_income_brackets(self):
    rows = tabulate(self.INCOME_DATA)
  • edit in project.py at line 271
    [2.5784]
    [3.251]
    assert rows[0][0] == "Negative/Nil income"
    assert rows[-3][0] == "$2000 or more"
    assert rows[-2][0] == "Partial income stated(b)"
    assert rows[-1][0] == "All incomes not stated(c)"
    rows = rows[1:-3]
    incomes = []
    for row in rows:
    incomes.append(row[0])
    return incomes
    def median_incomes(self):
        """Return the midpoint of each filtered income bracket.

        Returns:
            A list of ints, one per bracket from `filtered_income_brackets`,
            in the same order. Each value is the midpoint of the bracket's
            bounds, rounded up to a whole dollar.
        """
        midpoints = []
        for bracket in self.filtered_income_brackets():
            # Each bracket is split on the hyphen into its two bounds
            start, end = bracket.split("-")
            start, end = parse_financial_bracket(start, end)
            # Adding 1 before the floor division rounds a .5 midpoint up
            midpoints.append((start + end + 1) // 2)
        return midpoints
  • replacement in project.py at line 523
    [2.6014][2.6014:6069]()
    data = Census1996("Data/1996/1996_income_by_rent.csv")
    [2.6014]
    [3.7984]
    # data = Census1996()
    data = Census2001().median_incomes()