Refactor time-series code

finchie
Sep 9, 2023, 9:16 AM
JGCJ2CQPYNRNLDQGVRIDIB5EGKW2O7PCTAIHKZTTJQXE2VTY27QQC

Dependencies

  • [2] NMT3JJPP Add function to find median
  • [3] 2AE4VE7U Add support for 2001 census
  • [4] QU7MDQED Add basic support for 1996 census data
  • [5] 4YKXEBAV Reduce inconsistencies between 1996 and 2001 objects
  • [6] QDEBWR2R Add missing state time series data
  • [7] YXYO3B7J Restructure filesystem
  • [8] FCHAPZLD Implement simpler file merging function
  • [9] 7ZN6HHL2 Extract relevant census data

Change contents

  • edit in project.py at line 80
    [3.768]
    [3.768]
    # TODO: assert table is rectangular
  • edit in project.py at line 101
    [3.1171]
    [3.437]
    def should_keep(category):
        """Report whether a census column heading should be used.

        A heading is rejected when it contains any of the marker substrings
        listed below. The check is a plain, case-sensitive substring match.

        Args:
            category: Column heading, e.g. 'C11_1_149_R1_74'.

        Returns:
            True if none of the markers occurs in `category`, otherwise False.
        """
        bad_words = (
            "Neg_Ni_inc",  # Negative/nil income
            # Brackets over a certain threshold (eg $4000more); this single
            # marker also rejects headings spelled with "or more"
            "more",
            "NS",  # Not stated
            "Notstated",
            "Tot",  # Total
            "PI_S",
        )
        return not any(bad_word in category for bad_word in bad_words)
    def split_category(category):
        """Split a time-series column heading into year and bracket midpoints.

        Example: 'C11_1_149_R1_74' -> ('C11', 75, 38)

        Args:
            category: Heading made of five underscore-separated parts:
                year ('C' plus two digits), minimum income, maximum income,
                minimum rent (prefixed with 'R') and maximum rent.

        Returns:
            A tuple (year, mean_income, mean_rent). `year` is the unchanged
            'Cyy' prefix; each mean is the midpoint of its bracket, with
            halves rounded up.

        Raises:
            AssertionError: If `should_keep` rejects the heading or the
                heading does not have the shape described above.
            ValueError: If an income or rent bound is not an integer.
        """
        # The checks raise AssertionError explicitly instead of using `assert`
        # so that they are not stripped when Python runs with -O.
        if not should_keep(category):
            raise AssertionError(f"Invalid category: {category}")

        # All valid categories have 5 parts
        parts = category.split("_")
        if len(parts) != 5:
            raise AssertionError(f"Expected 5 parts in category: {category}")
        year, min_income, max_income, min_rent, max_rent = parts

        min_income = int(min_income)
        max_income = int(max_income)
        max_rent = int(max_rent)

        # Year starts with 'C' followed by 2 digits
        if not (len(year) == 3 and year[0] == "C" and year[1:].isdigit()):
            raise AssertionError(f"Malformed year in category: {category}")

        # `min_rent` is special because it has the 'R' prefix; startswith also
        # copes with an empty part, which indexing would not
        if not min_rent.startswith("R"):
            raise AssertionError(f"Rent must start with 'R': {category}")
        min_rent = int(min_rent[1:])

        # Adding 1 before the floor division rounds a .5 midpoint up
        mean_income = (min_income + max_income + 1) // 2
        mean_rent = (min_rent + max_rent + 1) // 2

        return (year, mean_income, mean_rent)
  • replacement in project.py at line 335
    [3.3959][3.251:337](),[3.5784][3.251:337](),[3.251][3.251:337]()
    def merge_files(files):
    # Set up our variables
    categories = []
    years = {}
    [3.3959]
    [3.337]
    class TimeSeriesBase(CensusDataset):
    def __init__(self, dataset):
        """Store the first row of `dataset` as headings and the rest as data."""
        heading_row = dataset[0]
        self.categories = heading_row
        body_rows = dataset[1:]
        self.data = body_rows
  • replacement in project.py at line 340
    [3.338][3.338:1186]()
    for filename in files:
    file = open(filename, "r").readlines()
    # Split each row on the comma
    file = [[col.strip() for col in row.split(",")] for row in file]
    # First line in file (file[0]) is headings
    first_line = file[0]
    # Don't need the first column, as it says 'STE_CODE_2021'
    # Example value for `categories_for_file`:
    # ['C11_Neg_Ni_inc_R1_74', 'C11_Neg_Ni_inc_R75_99', 'C11_Neg_Ni_inc_R100_149', etc]
    categories_for_file = first_line[1:]
    # When we need to use categories, use `categories_for_file`
    # The `file` variable should be JUST data, so remove the first line
    # Example value:
    """
    [
    ['1', '400', '505', etc],
    ['2', '241', '297', etc],
    etc
    ]
    """
    file = file[1:]
    [3.338]
    [3.1186]
    def median_incomes(self):
    incomes = {}
    for column in range(len(self.categories)):
    category = self.categories[column]
  • replacement in project.py at line 345
    [3.1187][3.1187:1588]()
    # Get a list of all the areas (states, could later be LGAs)
    # Should end up looking like:
    # ['1', '2', '3', etc]
    areas = []
    for row in range(len(file)):
    # Area code is the first column of each row
    # Example value: '1'
    first_cell = file[row][0]
    # Add that to list of all areas
    areas.append(first_cell)
    [3.1187]
    [3.1588]
    total_income = 0
    for row in range(len(self.data)):
    total_income += int(self.data[row][column])
  • replacement in project.py at line 349
    [3.1589][3.1589:1673]()
    # Remove this column from the row
    file[row] = file[row][1:]
    [3.1589]
    [3.1673]
    # Each category is a (mean_income, mean_rent) pair, so the income bracket
    # is the first item
    income = category[0]
    incomes[income] = incomes.get(income, 0) + total_income
  • replacement in project.py at line 354
    [3.1674][3.1674:1898]()
    # Go through ALL the data in the file column-by-column
    for column in range(len(categories_for_file)):
    # Example category: 'C11_Neg_Ni_inc_R1_74'
    category = categories_for_file[column]
    [3.1674]
    [3.1898]
    return incomes
  • replacement in project.py at line 356
    [3.1899][3.1899:2466]()
    # Example value: 'C11_Neg_Ni_inc_R1_74'
    # Split the category up into parts (separated by underscores)
    category = category.split("_")
    # Example value for category: ['C11', 'Neg', 'Ni', etc]
    # Year is always prefixed, so get everything before first underscore
    # Example value: 'C11' (first item in category)
    year = category[0]
    # Category is everything after the first underscore
    # Example value: 'Neg_Ni_inc_R1_74'
    category = "_".join(category[1:])
    [3.1899]
    [3.2466]
    def median_rents(self):
    rents = {}
    for column in range(len(self.categories)):
    category = self.categories[column]
  • replacement in project.py at line 361
    [3.2467][3.2467:2717]()
    # Add that category to the list of categories (without year)
    # Example value: 'Neg_Ni_inc_R1_74' (no year included!!)
    # TODO: assert all categories are the same across years
    categories.append(category)
    [3.2467]
    [3.2717]
    total_rent = 0
    for row in range(len(self.data)):
    total_rent += int(self.data[row][column])
  • replacement in project.py at line 365
    [3.2718][3.2718:3019]()
    # Set up the year if none exists
    # Example year: 'C11'
    if year not in years:
    years[year] = {}
    # Should look like:
    """
    {
    'C11': {},
    }
    """
    [3.2718]
    [3.3019]
    # Each category is a (mean_income, mean_rent) pair, so the rent bracket
    # is the second item
    rent = category[1]
    rents[rent] = rents.get(rent, 0) + total_rent
  • replacement in project.py at line 370
    [3.3020][3.3020:3579]()
    # Now add every row in this column
    for row in range(len(file)):
    # years[year] is a dict mapping states to rows
    # Every key is essentially one row of data in the csv
    # It will end up looking something like:
    """
    {
    '1': [1, 2, 3, etc],
    '2': [3, 1, 2, etc],
    etc
    }
    """
    # Example value: first row (row = 0), area = '1'
    area = areas[row]
    [3.3020]
    [3.3579]
    return rents
  • edit in project.py at line 372
    [3.3580][3.3580:3761]()
    # If the area hasn't been seen before
    if area not in years[year]:
    # Add an empty list
    years[year][area] = []
  • replacement in project.py at line 373
    [3.3762][3.3762:3998]()
    # This is the very big long row for the current year
    current_cell = file[row][column]
    # Add the current cell to the relevant dataset
    years[year][area].append(current_cell)
    [3.3762]
    [3.3998]
    class Census2011(TimeSeriesBase):
        """TimeSeriesBase table whose `year()` is 2011."""

        def year(self):
            """Return the census year for this table."""
            return 2011
  • edit in project.py at line 377
    [3.3999][3.3999:4030]()
    return (categories, years)
  • edit in project.py at line 378
    [3.4031]
    [3.4031]
    class Census2016(TimeSeriesBase):
        """TimeSeriesBase table whose `year()` is 2016."""

        def year(self):
            """Return the census year for this table."""
            return 2016
  • edit in project.py at line 382
    [3.4032][3.4032:4189](),[3.4189][3.5785:5960](),[3.5960][3.4269:4861](),[3.4269][3.4269:4861]()
    # Simple function that says whether or not to use a category
    def should_keep(category):
    # If any of these phrases are in the category, it's not allowed!
    bad_words = [
    "Neg_Ni_inc",
    "4000more",
    "R650more",
    "R_NS",
    "R_Tot",
    "PI_S",
    "or more",
    "Notstated",
    ]
    for bad_word in bad_words:
    if bad_word in category:
    # Bad word! Tell them not to use this category
    return False
    # Made it to the end without any bad words, should be fine
    return True
    # Take a valid category and return the mean income & rent
    def split_category(category):
    # Example category: '1_149_R1_74'
    parts = category.split("_")
    # Make sure it's a valid category
    if not should_keep(category):
    print(category)
    assert should_keep(category) is True
    # All valid categories have 4 parts
    assert len(parts) == 4
  • replacement in project.py at line 383
    [3.4862][3.4862:4978]()
    # Split up the parts
    min_income = int(parts[0])
    max_income = int(parts[1])
    max_rent = int(parts[3])
    [3.4862]
    [3.4978]
    class Census2021(TimeSeriesBase):
        """TimeSeriesBase table whose `year()` is 2021."""

        def year(self):
            """Return the census year for this table."""
            return 2021
  • edit in project.py at line 387
    [3.4979][3.4979:5232]()
    # `min_rent` is special because it has the 'R' prefix
    min_rent = parts[2]
    # Make sure the first character is an 'R'
    assert min_rent[0] == "R"
    # Then we can just convert it into an int like the others
    min_rent = int(min_rent[1:])
  • replacement in project.py at line 388
    [3.5233][3.5233:6861]()
    mean_income = round((min_income + max_income) / 2)
    mean_rent = round((min_rent + max_rent) / 2)
    return (mean_income, mean_rent)
    # Second step: split 2-variable categories into table
    def reshape_into_table(categories, years):
    # Example value: 'C11'
    for year in years:
    # Example value: '1'
    for area in years[year]:
    # TODO: categories may change between iterations
    # This should look like:
    """
    [
    # This is income of $1-$149
    [
    # This is rent of $1-$74
    1535,
    # This is rent of $75-$99
    1580,
    etc
    ],
    # This is income of $150-$299
    [
    # This is rent of $1-$74
    4591,
    # This is rent of $75-$99
    2188,
    etc
    ],
    etc
    ]
    """
    new_data = []
    # The last mean income we saw
    last_income = None
    # Make sure there are as many categories as data
    assert len(years[year][area]) == len(categories)
    # Go through each 2-variable category and data pair
    for column in range(len(categories)):
    # Get the relevant category
    category = categories[column]
    # Check if the category is part of our dataset
    if should_keep(category) is True:
    income, rent = split_category(category)
    [3.5233]
    [3.6861]
    class Census2011_2016_2021(CensusDataset):
    def __init__(self, files):
    census2011 = None
    census2016 = None
    census2021 = None
  • replacement in project.py at line 394
    [3.6862][3.6862:7059]()
    # Check if we've hit a new income bracket
    if last_income != income:
    new_data.append([])
    last_income = income
    [3.6862]
    [3.7059]
    for file_index in range(len(files)):
    # Use a context manager so the handle is closed as soon as it is read
    with open(files[file_index], "r") as handle:
        file = tabulate(handle.read())
  • replacement in project.py at line 398
    [3.7060][3.7060:7382]()
    # Append our rent information to the most recent bracket
    # The most recent bracket will always be the one we want
    current_cell = years[year][area][column]
    current_cell = int(current_cell)
    new_data[-1].append(current_cell)
    [3.7060]
    [3.7382]
    # First-time setup requires opening the first file
    if file_index == 0:
    census2011 = [[] for row in range(len(file))]
    census2016 = [[] for row in range(len(file))]
    census2021 = [[] for row in range(len(file))]
  • replacement in project.py at line 404
    [3.7383][3.7383:7469]()
    # Update this from old->new data
    years[year][area] = new_data
    [3.7383]
    [3.7469]
    # Headings are the first row
    headings = file[0]
    # Make sure to skip the first column (STE_CODE_2021)
    for column in range(1, len(file[0])):
    if should_keep(headings[column]):
    (year, mean_income, mean_rent) = split_category(headings[column])
  • replacement in project.py at line 411
    [3.7470][3.7470:7798]()
    incomes = []
    rents = []
    for category in categories:
    if should_keep(category):
    (mean_income, mean_rent) = split_category(category)
    if mean_income not in incomes:
    incomes.append(mean_income)
    if mean_rent not in rents:
    rents.append(mean_rent)
    [3.7470]
    [3.7798]
    # Add each cell to its corresponding row
    for row in range(len(file)):
    if row == 0:
    cell = (mean_income, mean_rent)
    else:
    cell = file[row][column]
  • replacement in project.py at line 418
    [3.7799][3.7799:7836]()
    return ((incomes, rents), years)
    [3.7799]
    [3.7836]
    if year == "C11":
    census2011[row].append(cell)
    elif year == "C16":
    census2016[row].append(cell)
    elif year == "C21":
    census2021[row].append(cell)
    else:
    raise ValueError("Unsupported census year")
  • edit in project.py at line 427
    [3.7837]
    [3.7837]
    # Make sure headings are consistent across years
    assert census2011[0] == census2016[0] == census2021[0]
  • replacement in project.py at line 430
    [3.7838][3.314:429](),[3.429][3.989:1009](),[3.7966][3.989:1009]()
    # (categories, years) = merge_files([TIME_SERIES_A])
    # (categories, years) = reshape_into_table(categories, years)
    # print(categories)
    [3.7838]
    [3.1009]
    self.census2011 = Census2011(census2011)
    self.census2016 = Census2016(census2016)
    self.census2021 = Census2021(census2021)
  • replacement in project.py at line 434
    [3.1010][3.5961:6014]()
    # parts = merge_time_series_files(STATE_TIME_SERIES)
    [3.1010]
    [2.0]
    def get_year(self, year):
        """Return the dataset held for one census year.

        Args:
            year: Census year as a number: 2011, 2016 or 2021.

        Returns:
            The matching `census2011`, `census2016` or `census2021` attribute.

        Raises:
            ValueError: If `year` is not one of the supported years.
        """
        supported = (
            (2011, "census2011"),
            (2016, "census2016"),
            (2021, "census2021"),
        )
        for known_year, attribute in supported:
            if year == known_year:
                return getattr(self, attribute)
        raise ValueError("Unsupported year")
  • replacement in project.py at line 503
    [2.1842][3.1945:2089](),[3.3983][3.1945:2089]()
    print(Census1996().median_incomes())
    print(Census1996().median_rents())
    print(Census2001().median_incomes())
    print(Census2001().median_rents())
    [2.1842]
    [2.1843]
    # print(Census1996().median_incomes())
    # print(Census1996().median_rents())
    # print(Census2001().median_incomes())
    # print(Census2001().median_rents())
  • replacement in project.py at line 508
    [2.1844][2.1844:1936]()
    census = Census2001().median_incomes()
    for key in census:
    print(f"{key},{census[key]}")
    [2.1844]
    [2.1936]
    # census = Census2001().median_incomes()
    # for key in census:
    # print(f"{key},{census[key]}")
    # print(find_median(census))
  • replacement in project.py at line 514
    [2.1937][2.1937:1964]()
    print(find_median(census))
    [2.1937]
    [3.7984]
    # Build the combined 2011/2016/2021 dataset from STATE_TIME_SERIES and
    # print what median_rents() reports for the 2011 census
    time_series = Census2011_2016_2021(STATE_TIME_SERIES)
    print(time_series.get_year(2011).median_rents())