Dutch schools dataset construction

This article is the third part of dataset construction. Here I want to add new data into dataset I got on previous steps. This time it will be info students who wasn’t able to make it on time and stayed in primary school longer then others

All the data for this part of dataset construction I got here: https://duo.nl/open_onderwijsdata/databestanden/po/leerlingen-po/bo-sbo/zittenblijven.jsp

3. Students who stayed in primary school longer.

Read .csv file of new data:

import pandas as pd 
data_raw = pd.read_csv("input_data/Students_stayed_longer_2013_2019.csv", error_bad_lines=False, sep=';', encoding = "ISO-8859-1") 

print(data_raw.shape)
print(data_raw.columns)
data_raw.head() 

(6910, 39)
Index(['BRIN_NUMMER', 'VESTIGINGSNUMMER', 'INSTELLINGSNAAM_VESTIGING',
       'POSTCODE_VESTIGING', 'PLAATSNAAM', 'GEMEENTENUMMER', 'GEMEENTENAAM',
       'PROVINCIE', 'SOORT_PO', 'DENOMINATIE_VESTIGING',
       'BEVOEGD_GEZAG_NUMMER', 'LJ8_13', 'ZIT_13', 'ZITPERC_13',
       'ZITPERC_LJ_13', 'LJ8_14', 'ZIT_14', 'ZITPERC_14', 'ZITPERC_LJ_14',
       'LJ8_15', 'ZIT_15', 'ZITPERC_15', 'ZITPERC_LJ_15', 'LJ8_16', 'ZIT_16',
       'ZITPERC_16', 'ZITPERC_LJ_16', 'LJ8_17', 'ZIT_17', 'ZITPERC_17',
       'ZITPERC_LJ_17', 'LJ8_18', 'ZIT_18', 'ZITPERC_18', 'ZITPERC_LJ_18',
       'LJ8_19', 'ZIT_19', 'ZITPERC_19', 'ZITPERC_LJ_19'],
      dtype='object')

	BRIN_NUMMER	INSTELLINGSNAAM_VESTIGING	POSTCODE_VESTIGING	PLAATSNAAM	GEMEENTENUMMER	GEMEENTENAAM	PROVINCIE	SOORT_PO	DENOMINATIE_VESTIGING	...	ZITPERC_17	ZITPERC_LJ_17	LJ8_18	ZIT_18	ZITPERC_18	ZITPERC_LJ_18	LJ8_19	ZIT_19	ZITPERC_19	ZITPERC_LJ_19
0	00AP	Wereldwijs	2716PH	ZOETERMEER	637	Zoetermeer	Zuid-Holland	Bo	Gereformeerd vrijgemaakt	...	,181818181818182	,0227272727272727	11	1	,0909090909090909	,0113636363636364	8	2	,25	,03125
1	00AR	BS "De Maasparel"	6107AW	STEVENSWEERT	1641	Maasgouw	Limburg	Bo	Rooms-Katholiek	...	,0588235294117647	,00735294117647059	17	1	,0588235294117647	,00735294117647059	15	0	0	0
2	00AV	De Morgenster	3201CN	SPIJKENISSE	1930	Nissewaard	Zuid-Holland	Bo	Gereformeerd vrijgemaakt	...	0	0	12	1	,0833333333333333	,0104166666666667	7	0	0	0
3	00AZ	De Stapsteen	2971AR	BLESKENSGRAAF CA	1978	Molenlanden	Zuid-Holland	Bo	Openbaar	...	,25	,03125	14	2	,142857142857143	,0178571428571429	11	1	,0909090909090909	,0113636363636364
4	00BA	OBS De Klimboom	6666EB	HETEREN	1734	Overbetuwe	Gelderland	Bo	Openbaar	...	0	0	15	0	0	0	11	0	0	0

5 rows × 39 columns

In this dataset I see that there are a lot of columns like ZIT_YEAR, ZITPERC_YEAR, ZITPERC_LJ_YEAR, LJ8_YEAR, so I would like to get this YEAR part and transform this columns to rows with specific year value and only 3 columns: ZIT, ZITPERC, ZITPERC_LJ and LJ8. It will need a lot of transformations, so let’s start with creating few separate dataset for each year:

def convert_to_numeric(dataframe, column):
    dataframe[column] = dataframe[column].astype(str).str.replace(',','.')
    dataframe[column] = dataframe[column].astype(str).str.replace(r'^.','0.')
    dataframe[column] = pd.to_numeric(dataframe[column])
    
def get_year_dataset(year):
    data = (data_raw.copy())[['BRIN_NUMMER', 'VESTIGINGSNUMMER', 
                              'LJ8_' + year, 'ZIT_' + year,
                              'ZITPERC_' + year]]
    data['SCHOOL_ID'] = data["VESTIGINGSNUMMER"].astype(str) + data["BRIN_NUMMER"] + "_20" + year
    data.rename(columns = {'LJ8_' + year: 'LJ8',
                           'ZIT_' + year: 'ZIT',
                           'ZITPERC_' + year: 'ZITPERC'},
                inplace = True)
    data.drop('VESTIGINGSNUMMER', 1, inplace = True)
    data.drop('BRIN_NUMMER', 1, inplace = True)
    convert_to_numeric(data, 'ZITPERC')
    data['ZITPERC'] = data['ZITPERC'] * 100
    return data


data_2014 = get_year_dataset('14')
data_2015 = get_year_dataset('15')
data_2016 = get_year_dataset('16')
data_2017 = get_year_dataset('17')
data_2018 = get_year_dataset('18')

data_2014.head()
data_2014.dtypes

LJ8            int64
ZIT            int64
ZITPERC      float64
SCHOOL_ID     object
dtype: object

And merge it together:

data_frames = [data_2014, data_2015, data_2016, data_2017, data_2018]
data_second_year = pd.concat(data_frames).fillna(0)
data_second_year.reset_index(drop=True, inplace=True)
print(data_2014.shape)
print(data_2015.shape)
print(data_2016.shape)
print(data_2017.shape)
print(data_2017.shape)
print(data_2018.shape)
print(data_second_year.shape)
data_second_year

(6910, 4)
(6910, 4)
(6910, 4)
(6910, 4)
(6910, 4)
(6910, 4)
(34550, 4)

	LJ8	ZIT	ZITPERC	SCHOOL_ID
0	11	0	0.000000	000AP_2014
1	18	1	5.555556	000AR_2014
2	3	1	33.333333	000AV_2014
3	24	3	12.500000	000AZ_2014
4	12	4	33.333333	000BA_2014
...	...	...	...	...
34545	0	0	0.000000	031ML_2018
34546	0	0	0.000000	031MM_2018
34547	0	0	0.000000	031MN_2018
34548	0	0	0.000000	031MV_2018
34549	0	0	0.000000	031MX_2018

34550 rows × 4 columns

Let’s check missing and unexpected values:

print(data_second_year.dtypes)

LJ8            int64
ZIT            int64
ZITPERC      float64
SCHOOL_ID     object
dtype: object

lj8_zero = data_second_year[data_second_year['LJ8'] == 0]
print(lj8_zero.shape)
lj8_zero.head()

(2282, 4)

	LJ8	ZIT	ZITPERC	SCHOOL_ID
61	0	0	0.0	003AX_2014
69	0	0	0.0	003BW_2014
75	0	0	0.0	003CS_2014
124	0	0	0.0	003HM_2014
172	0	0	0.0	003LC_2014

It looks like we have 2282 values with total amount of students in class 8 is 0. It could be wrong data or mistake, but anyway this rows says us nothing, so we can remove it:

data_second_year.drop(lj8_zero.index, 0, inplace = True)
print(data_second_year.shape)
data_second_year.head()

(32268, 4)

	LJ8	ZIT	ZITPERC	SCHOOL_ID
0	11	0	0.000000	000AP_2014
1	18	1	5.555556	000AR_2014
2	3	1	33.333333	000AV_2014
3	24	3	12.500000	000AZ_2014
4	12	4	33.333333	000BA_2014

So now let’s read dataset from previous step, add our dataset there and write it to down to final file:

data_score = pd.read_csv("score_second_result.csv", error_bad_lines=False, sep=',', encoding = "ISO-8859-1") 

print(data_score.shape)
print(data_score.columns)
data_score.head()

(31300, 33)
Index(['SCHOOL_ID', 'DATUM', 'INSTELLINGSNAAM_VESTIGING', 'POSTCODE_VESTIGING',
       'PLAATSNAAM', 'GEMEENTENUMMER', 'GEMEENTENAAM', 'PROVINCIE', 'SOORT_PO',
       'DENOMINATIE_VESTIGING', 'BEVOEGD_GEZAG_NUMMER', 'EXAMEN',
       'EXAMEN_AANTAL', 'EXAMEN_GEM', 'REKENEN_LAGER1F', 'REKENEN_1F',
       'REKENEN_1S', 'REKENEN_2F', 'LV_LAGER1F', 'LV_1F', 'LV_2F',
       'TV_LAGER1F', 'TV_1F', 'TV_2F', 'VSO', 'PRO', 'VMBO', 'VMBO_HAVO',
       'HAVO', 'HAVO_VWO', 'VWO', 'ADVIES_NIET_MOGELIJK', 'TOTAAL_ADVIES'],
      dtype='object')

	SCHOOL_ID	DATUM	INSTELLINGSNAAM_VESTIGING	POSTCODE_VESTIGING	PLAATSNAAM	GEMEENTENUMMER	GEMEENTENAAM	PROVINCIE	SOORT_PO	DENOMINATIE_VESTIGING	...	VSO	PRO	VMBO	VMBO_HAVO	HAVO	HAVO_VWO	VWO	TOTAAL_ADVIES
0	000AP_2015	2015	De Schanskorf	2715BT	ZOETERMEER	637	Zoetermeer	Zuid-Holland	Bo	Gereformeerd vrijgemaakt	...	0	0	1	0	5	1	1	11
1	000AR_2015	2015	BS "De Maasparel"	6109AM	OHE EN LAAK	1641	Maasgouw	Limburg	Bo	Rooms-Katholiek	...	0	0	1	0	7	0	6	18
2	000AZ_2015	2015	De Kiezel en de Kei	2971AR	BLESKENSGRAAF CA	1927	Molenwaard	Zuid-Holland	Bo	Openbaar	...	0	1	3	5	3	2	4	22
3	000BA_2015	2015	OBS De Klimboom	6666EB	HETEREN	1734	Overbetuwe	Gelderland	Bo	Openbaar	...	1	1	1	0	1	4	0	12
4	000BB_2015	2015	Obs Letterwies	9944AR	NIEUWOLDA	1895	Oldambt	Groningen	Bo	Openbaar	...	0	0	7	0	5	0	2	16

5 rows × 33 columns

data = pd.merge(data_score, data_second_year,  on='SCHOOL_ID',  how='left').fillna(0)
data.LJ8 = data.LJ8.astype(int)
data.ZIT = data.ZIT.astype(int)
print(data.shape)
print(data.columns)
data.head()

(31300, 36)
Index(['SCHOOL_ID', 'DATUM', 'INSTELLINGSNAAM_VESTIGING', 'POSTCODE_VESTIGING',
       'PLAATSNAAM', 'GEMEENTENUMMER', 'GEMEENTENAAM', 'PROVINCIE', 'SOORT_PO',
       'DENOMINATIE_VESTIGING', 'BEVOEGD_GEZAG_NUMMER', 'EXAMEN',
       'EXAMEN_AANTAL', 'EXAMEN_GEM', 'REKENEN_LAGER1F', 'REKENEN_1F',
       'REKENEN_1S', 'REKENEN_2F', 'LV_LAGER1F', 'LV_1F', 'LV_2F',
       'TV_LAGER1F', 'TV_1F', 'TV_2F', 'VSO', 'PRO', 'VMBO', 'VMBO_HAVO',
       'HAVO', 'HAVO_VWO', 'VWO', 'ADVIES_NIET_MOGELIJK', 'TOTAAL_ADVIES',
       'LJ8', 'ZIT', 'ZITPERC'],
      dtype='object')

	SCHOOL_ID	DATUM	INSTELLINGSNAAM_VESTIGING	POSTCODE_VESTIGING	PLAATSNAAM	GEMEENTENUMMER	GEMEENTENAAM	PROVINCIE	SOORT_PO	DENOMINATIE_VESTIGING	...	VMBO	VMBO_HAVO	HAVO	HAVO_VWO	VWO	TOTAAL_ADVIES	LJ8	ZIT	ZITPERC
0	000AP_2015	2015	De Schanskorf	2715BT	ZOETERMEER	637	Zoetermeer	Zuid-Holland	Bo	Gereformeerd vrijgemaakt	...	1	0	5	1	1	11	13	0	0.000000
1	000AR_2015	2015	BS "De Maasparel"	6109AM	OHE EN LAAK	1641	Maasgouw	Limburg	Bo	Rooms-Katholiek	...	1	0	7	0	6	18	17	0	0.000000
2	000AZ_2015	2015	De Kiezel en de Kei	2971AR	BLESKENSGRAAF CA	1927	Molenwaard	Zuid-Holland	Bo	Openbaar	...	3	5	3	2	4	22	19	3	15.789474
3	000BA_2015	2015	OBS De Klimboom	6666EB	HETEREN	1734	Overbetuwe	Gelderland	Bo	Openbaar	...	1	0	1	4	0	12	24	5	20.833333
4	000BB_2015	2015	Obs Letterwies	9944AR	NIEUWOLDA	1895	Oldambt	Groningen	Bo	Openbaar	...	7	0	5	0	2	16	15	2	13.333333

5 rows × 36 columns

Looking at the list of columns I can see that we probably won’t need BEVOEGD_GEZAG_NUMMER and ZITPERC columns because we can get the info in this columns from other places in dataset. I also think that we should remove GEMEENTENUMMER columns because we’ve already have GEMEENTENAAM which has the same info. Let’s remove them and then write the result dataset to the output folder:

data.drop('GEMEENTENUMMER', 1, inplace = True)
data.drop('BEVOEGD_GEZAG_NUMMER', 1, inplace = True)
data.drop('ZITPERC', 1, inplace = True)
data.columns

Index(['SCHOOL_ID', 'DATUM', 'INSTELLINGSNAAM_VESTIGING', 'POSTCODE_VESTIGING',
       'PLAATSNAAM', 'GEMEENTENAAM', 'PROVINCIE', 'SOORT_PO',
       'DENOMINATIE_VESTIGING', 'EXAMEN', 'EXAMEN_AANTAL', 'EXAMEN_GEM',
       'REKENEN_LAGER1F', 'REKENEN_1F', 'REKENEN_1S', 'REKENEN_2F',
       'LV_LAGER1F', 'LV_1F', 'LV_2F', 'TV_LAGER1F', 'TV_1F', 'TV_2F', 'VSO',
       'PRO', 'VMBO', 'VMBO_HAVO', 'HAVO', 'HAVO_VWO', 'VWO',
       'ADVIES_NIET_MOGELIJK', 'TOTAAL_ADVIES', 'LJ8', 'ZIT'],
      dtype='object')

data.to_csv(r'output/Score.csv', index = False)