In [1]:
# Load relevant libraries
import pandas as pd
import numpy as np
import matplotlib
import folium
import geocoder
from tqdm import tqdm
%pylab inline
pylab.rcParams['figure.figsize'] = (10, 8)
Populating the interactive namespace from numpy and matplotlib
In [4]:
# Import pandas
import pandas as pd

# Assign spreadsheet filename: file
file = 'imd_student_blind.xlsx'

# Load spreadsheet: xl
xl = pd.ExcelFile(file)

# Print sheet names
print(xl.sheet_names)
['Sheet1']
In [5]:
# Load a sheet into a DataFrame by index: df
df = xl.parse(0)

# Print the head of the DataFrame df
df.head()
Out[5]:
a_ID CEP ano_ingresso periodo_ingresso status ano_disciplina periodo_disciplina nota disciplina_ID status.disciplina
0 0 59015430 2014 1 CANCELADO 2014 2 2.6 0 Reprovado
1 0 59015430 2014 1 CANCELADO 2015 1 8.0 0 Aprovado
2 1 59073120 2014 1 CANCELADO 2014 2 0.1 0 Reprovado
3 2 59072580 2014 1 ATIVO 2014 2 6.1 0 Aprovado
4 3 59088150 2014 1 ATIVO 2014 1 3.0 0 Reprovado
In [7]:
# Group the data to CEP != 0
df1 = df.loc[df['CEP'] != 0].reset_index(drop = True)
df1.head()
Out[7]:
a_ID CEP ano_ingresso periodo_ingresso status ano_disciplina periodo_disciplina nota disciplina_ID status.disciplina
0 0 59015430 2014 1 CANCELADO 2014 2 2.6 0 Reprovado
1 0 59015430 2014 1 CANCELADO 2015 1 8.0 0 Aprovado
2 1 59073120 2014 1 CANCELADO 2014 2 0.1 0 Reprovado
3 2 59072580 2014 1 ATIVO 2014 2 6.1 0 Aprovado
4 3 59088150 2014 1 ATIVO 2014 1 3.0 0 Reprovado
In [ ]:
# Group the data to 'nota' >= 7
dfNotasMaior5 = df1.loc[df1['nota'] >= 7].reset_index(drop = True)
dfNotasMaior5.head()
In [ ]:
# Group the data to 'nota' < 5
dfNotasMenor5 = df1.loc[df1['nota'] < 5].reset_index(drop = True)
dfNotasMenor5.head()
In [10]:
# We tested the geocoder library with CEP and country as input. Let's create a new column in our dataframe that contains these values.
dfNotasMaior5['geocode_input'] = dfNotasMaior5['CEP'].apply(str) + ', ' + 'Brazil'

# We also create two additional columns for lattitude and longitude.
dfNotasMaior5['lat'], dfNotasMaior5['long'] = [0, 0]

# Display the head of the updated dataframe.
dfNotasMaior5.head()
Out[10]:
a_ID CEP ano_ingresso periodo_ingresso status ano_disciplina periodo_disciplina nota disciplina_ID status.disciplina geocode_input lat long
0 0 59015430 2014 1 CANCELADO 2015 1 8.0 0 Aprovado 59015430, Brazil 0 0
1 3 59088150 2014 1 ATIVO 2014 2 7.2 0 Aprovado 59088150, Brazil 0 0
2 5 59112430 2014 1 ATIVO 2014 2 8.9 0 Aprovado 59112430, Brazil 0 0
3 13 59082180 2014 1 CANCELADO 2015 1 7.3 0 Aprovado 59082180, Brazil 0 0
4 14 59150500 2014 1 ATIVO 2015 2 7.7 0 Aprovado 59150500, Brazil 0 0
In [ ]:
# Create geocoder with data
for i in tqdm(range(len(dfNotasMaior5))):
    g = geocoder.google(dfNotasMaior5.loc[i,'geocode_input'])
    dfNotasMaior5.ix[i,'lat'] = g.lat
    dfNotasMaior5.ix[i,'long'] = g.lng
print('Geocoding complete!')
In [29]:
# Create cluster with data 'nota' >= 7

import folium

from folium.plugins import HeatMap

# Set map center and zoom level
mapc = [-5.9, -35.2]
zoom = 10

coordinates = []

for i in range(len(dfNotasMaior5)):
    # eliminate items with'nan' element
    if all(~np.isnan([dfNotasMaior5.ix[i,'lat'], dfNotasMaior5.ix[i,'long'], dfNotasMaior5.ix[i,'a_ID']])):
        coordinates.append([dfNotasMaior5.ix[i,'lat'], dfNotasMaior5.ix[i,'long'], dfNotasMaior5.ix[i,'a_ID']])

# Create map object
m = folium.Map(location=mapc,
                    zoom_start=zoom)

# Create cluster
marker_cluster = folium.MarkerCluster().add_to(m)

# Plot each of the locations that we geocoded
for j in tqdm(range(len(dfNotasMaior5))):
    folium.Marker([dfNotasMaior5.ix[j,'lat'], dfNotasMaior5.ix[j,'long']],
        icon=folium.Icon(color='green',icon='info-sign')
        ).add_to(marker_cluster)

m
100%|██████████| 1202/1202 [00:27<00:00, 43.17it/s]
Out[29]:
In [31]:
# We tested the geocoder library with CEP and country as input. Let's create a new column in our dataframe that contains these values.
dfNotasMenor5['geocode_input'] = dfNotasMenor5['CEP'].apply(str) + ', ' + 'Brazil'

# We also create two additional columns for lattitude and longitude.
dfNotasMenor5['lat'], dfNotasMenor5['long'] = [0, 0]

# Display the head of the updated dataframe.
dfNotasMenor5.head()
Out[31]:
a_ID CEP ano_ingresso periodo_ingresso status ano_disciplina periodo_disciplina nota disciplina_ID status.disciplina geocode_input lat long
0 0 59015430 2014 1 CANCELADO 2014 2 2.6 0 Reprovado 59015430, Brazil 0 0
1 1 59073120 2014 1 CANCELADO 2014 2 0.1 0 Reprovado 59073120, Brazil 0 0
2 3 59088150 2014 1 ATIVO 2014 1 3.0 0 Reprovado 59088150, Brazil 0 0
3 4 59064245 2014 1 CANCELADO 2014 2 2.1 0 Reprovado 59064245, Brazil 0 0
4 4 59064245 2014 1 CANCELADO 2015 1 2.3 0 Reprovado 59064245, Brazil 0 0
In [42]:
# Create geocoder with data
for i in tqdm(range(len(dfNotasMenor5))):
    g = geocoder.google(dfNotasMenor5.loc[i,'geocode_input'])
    dfNotasMenor5.ix[i,'lat'] = g.lat
    dfNotasMenor5.ix[i,'long'] = g.lng
print('Geocoding complete!')
  0%|          | 0/651 [00:00<?, ?it/s]
  0%|          | 1/651 [00:01<11:11,  1.03s/it]
100%|██████████| 651/651 [09:39<00:00,  1.66it/s]
Geocoding complete!

In [46]:
# Create cluster with data 'nota' >= 7
import folium

from folium.plugins import HeatMap

# Set map center and zoom level
mapc = [-5.9, -35.2]
zoom = 10

coordinates = []

for i in range(len(dfNotasMenor5)):
    # eliminate items with'nan' element
    if all(~np.isnan([dfNotasMenor5.ix[i,'lat'], dfNotasMenor5.ix[i,'long'], dfNotasMenor5.ix[i,'a_ID']])):
        coordinates.append([dfNotasMenor5.ix[i,'lat'], dfNotasMenor5.ix[i,'long'], dfNotasMenor5.ix[i,'a_ID']])

# Create map object
m = folium.Map(location=mapc,
                    zoom_start=zoom)

marker_cluster = folium.MarkerCluster().add_to(m)

#folium.Marker(
#    location=[coordinates],
#    popup='Add popup text here.',
#    icon=folium.Icon(color='green', icon='ok-sign'),
#).add_to(marker_cluster)

# Plot each of the locations that we geocoded
for j in tqdm(range(len(dfNotasMenor5))):
    folium.Marker([dfNotasMenor5.ix[j,'lat'], dfNotasMenor5.ix[j,'long']],
        icon=folium.Icon(color='green',icon='info-sign')
        ).add_to(marker_cluster)

#HeatMap(coordinates).add_to(m)

m
100%|██████████| 1949/1949 [00:48<00:00, 40.15it/s]
Out[46]:
In [ ]: