This is the first dataset that uses classes; all the others will probably migrate over to this approach.
QU7MDQED456DFUKHLHCS2SS5QSLDYFU233YNQ27KY2TOVDMTSUKAC
def remove_quoted_commas(line):
    """Strip double quotes from ``line`` and drop commas that sit inside them.

    Commas outside of quotes are preserved, so the result can safely be
    split on ``","``. For example ``'"$1,000",5'`` becomes ``'$1000,5'``.

    Args:
        line: Text (one or more CSV rows) to clean.

    Returns:
        The text with every ``"`` removed and every quoted comma removed.

    Raises:
        AssertionError: If a quote is opened but never closed.
    """
    inside_quote = False
    # Collect the kept characters and join once at the end instead of
    # repeatedly concatenating strings
    kept = []
    for character in line:
        if character == '"':
            # Quote characters only toggle state; they are never emitted
            inside_quote = not inside_quote
        elif character != "," or not inside_quote:
            # Everything is kept except commas found inside a quoted field
            kept.append(character)
    # Make sure no quotes are unclosed
    assert not inside_quote, "unclosed quote in line"
    return "".join(kept)
# Base dataset functionality, can be used to see broad historical trends from 1996-2021
class CensusDataset:
    """Common interface for census datasets.

    Every method here raises ``NotImplementedError``; concrete datasets
    are expected to subclass this and override each of them.
    """

    def census_year(self):
        """Return the census year of the dataset (must be overridden)."""
        raise NotImplementedError()

    def median_incomes(self):
        """Return the dataset's income figures (must be overridden)."""
        raise NotImplementedError()

    def median_rents(self):
        """Return the dataset's rent figures (must be overridden)."""
        raise NotImplementedError()
class Census1996(CensusDataset):
    """Parser for the 1996 census CSV export of population by income and rent.

    The file is split into sections; the section holding the weekly rent
    column headings and the section holding the per-income-bracket rows
    are kept on the instance and parsed by the methods below.

    NOTE(review): ``should_keep`` is called by this class but is not
    defined in the visible part of the file - presumably it lives
    elsewhere in this module; confirm.
    """

    def __init__(self, filename):
        """Read ``filename``, split it into sections and parse the tables.

        Parsing happens eagerly, so a file with an unexpected layout fails
        here (``AssertionError``, or ``IndexError`` when there are fewer
        than six sections).
        """
        # A context manager guarantees the handle is closed, even on error
        with open(filename, "r") as handle:
            contents = handle.read()
        # Split the file up into sections; each section is separated by
        # this string. Some chunks are just section breaks (empty once
        # stripped), so ignore those
        stripped_chunks = (
            chunk.strip() for chunk in contents.split(",,,,,,,,,,,,,")
        )
        self.sections = [section for section in stripped_chunks if section != ""]
        self.WEEKLY_RENT_HEADING = self.sections[4]
        self.POPULATION_BY_INCOME = self.sections[5]
        # Extract the median rent (midpoint between range) for each column.
        # The result is unused here; the call validates the heading layout
        self.median_rents()
        # Extract the median household income (midpoint between each range)
        # for each bracket. Again called for its validation
        self.median_incomes()
        # Extract the actual table
        population_data = self.population_data()
        # NOTE: debugging aid - dumps the parsed table on construction
        from pprint import pprint
        pprint(population_data)

    def census_year(self):
        """Return the year of this census (1996)."""
        return 1996

    def filter_rent_brackets(self):
        """Return ``(start_ranges, end_ranges)`` for the bounded rent columns.

        Both are lists of raw heading strings, one entry per column, with
        the trailing '$1000 or more', 'Not stated' and 'Total' columns
        removed.

        Raises:
            AssertionError: If the heading section has an unexpected shape.
        """
        # Remove commas inside of quotes as to not break parsing
        weekly_rent_heading = remove_quoted_commas(self.WEEKLY_RENT_HEADING)
        assert weekly_rent_heading.startswith(",,,,,, Weekly rent,,,,,,,")
        assert weekly_rent_heading.count("\n") == 2
        # The headings are split between 2 lines (using example $0-$99):
        # 1st line: start of range (eg $0-)
        # 2nd line: end of range (eg $99)
        [start_ranges, end_ranges] = weekly_rent_heading.splitlines()[1:]
        # Remove comma prefix and split into columns
        start_ranges = start_ranges[1:].split(",")
        end_ranges = end_ranges[1:].split(",")
        # Remove the last 3 columns:
        # 1. $1000 or more
        # 2. Not stated
        # 3. Total
        assert start_ranges[-3:] == ["$1000", "Not", ""]
        assert end_ranges[-3:] == ["or more", "stated", "Total"]
        start_ranges = start_ranges[:-3]
        end_ranges = end_ranges[:-3]
        # Every bracket needs both a start and an end heading
        assert len(start_ranges) == len(end_ranges)
        return (start_ranges, end_ranges)

    def parse_rent_bracket(self, start, end):
        """Convert one rent bracket heading pair into integer bounds.

        Args:
            start: Start-of-range heading, e.g. ``"$0-"``.
            end: End-of-range heading, e.g. ``"$99"``.

        Returns:
            ``(start, end)`` as integers, e.g. ``(0, 99)``.

        Raises:
            AssertionError: If either heading is not in the expected form.
        """
        assert start.startswith("$")
        assert start.endswith("-")
        assert end.startswith("$")
        # Strip '$' prefix and '-' suffix
        start = start[1:-1]
        # Strip '$' prefix
        end = end[1:]
        # Make sure we have filtered out bad categories such as 'Not stated'
        assert should_keep(start + end)
        # Convert both bounds to integers (the midpoint is computed by the caller)
        return (int(start), int(end))

    def median_rents(self):
        """Return the midpoint of each rent bracket, one per column.

        The midpoint is rounded up, eg $50 for $0-$99.
        """
        (start_ranges, end_ranges) = self.filter_rent_brackets()
        rents = []
        for start_heading, end_heading in zip(start_ranges, end_ranges):
            (start, end) = self.parse_rent_bracket(start_heading, end_heading)
            rents.append((start + end + 1) // 2)
        return rents

    def filter_income_brackets(self):
        """Return the income rows that describe a bounded dollar range.

        The first two rows (negative / nil income) and the last three rows
        ('$2,000 or more', partial income stated, not stated) are dropped.

        Raises:
            AssertionError: If those rows are not where they are expected.
        """
        rows = self.POPULATION_BY_INCOME.splitlines()
        # Remove rows containing irrelevant data
        assert rows[0].startswith("Negative income")
        assert rows[1].startswith("Nil income")
        assert rows[-3].startswith('"$2,000 or more"')
        assert rows[-2].startswith("Partial income stated(a)")
        assert rows[-1].startswith("All incomes not stated(b)")
        return rows[2:-3]

    def extract_income_columns(self, line):
        """Split one income row into its columns.

        Quoted amounts are normalised first, so '"$1,000"' becomes '$1000'
        and does not get split in two.
        """
        return remove_quoted_commas(line).split(",")

    def median_incomes(self):
        """Return the midpoint (rounded up) of each bounded income bracket."""
        incomes = []
        for line in self.filter_income_brackets():
            columns = self.extract_income_columns(line)
            bracket = columns[0]
            # Two dollar amounts separated by a hyphen ('-'), such as '$1-$39'
            [start, end] = bracket.split("-")
            # Make sure we've got data in the right shape
            assert start.startswith("$")
            assert end.startswith("$")
            assert should_keep(bracket)
            # Strip '$' prefixes, convert to integers and take the midpoint
            incomes.append((int(start[1:]) + int(end[1:]) + 1) // 2)
        return incomes

    def population_data(self):
        """Return the population counts as a list of rows of integers.

        There is one row per bounded income bracket; the income bracket
        label itself (first column) is dropped.

        NOTE(review): columns are not trimmed the way the rent headings
        are, so each row presumably still includes the '$1000 or more',
        'Not stated' and 'Total' columns - confirm against callers.
        """
        return [
            # We don't care about income brackets, only population data
            [int(column) for column in self.extract_income_columns(line)[1:]]
            for line in self.filter_income_brackets()
        ]