Ana's notebook
Hypothesis: Is there a relation between MC, approval rate and ENEM grades?
This analysis consists of three steps:
!pip install -U scikit-learn
from sklearn.cluster import AgglomerativeClustering, AffinityPropagation, KMeans
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
from bokeh.palettes import Set1
# Import pandas as pd
import pandas as pd
import random
import folium
import geocoder #GEOCODER
# Load the student records; every row is one (student, subject) record.
data = pd.read_csv('../data/imd-student-blind-complete.csv')
data.info()  # summary of the columns before any reordering

# Order the rows by student id (ascending is the pandas default).
data = data.sort_values(by='a_ID')
data.head()
# Group the rows by student: students[a_ID] is the list of that student's
# rows, in the order they appear in `data`.
# The student id is not bound to a variable called `id`, so the builtin
# id() stays usable in the rest of the notebook.
students = {}
for _, row in data.iterrows():
    students.setdefault(row['a_ID'], []).append(row)
# MC calculation: the MC is the weighted average of a student's grades in the
# subjects they were approved in; each subject's weight is based on its
# number of hours.
# `w` holds those weights and is indexed by position through get_weight().
w = [60.0, 30.0, 180.0, 90.0, 30.0, 90.0, 60.0]
def weighted_average(grades, weights):
    """Return the weighted mean of ``grades``.

    Each grade is multiplied by the weight found at the same position, and
    the total is divided by the sum of every entry in ``weights``.

    Raises IndexError if ``weights`` is shorter than ``grades`` and
    ZeroDivisionError if the weights sum to zero.
    """
    weighted_total = sum(
        grades[pos] * weights[pos] for pos in range(len(grades))
    )
    return weighted_total / sum(weights)
def get_weight(index):
    """Return the entry of the module-level list ``w`` at ``index``.

    ``index`` is converted with ``int()`` first, so float values such as
    ``2.0`` are accepted.
    """
    position = int(index)
    return w[position]
The function called points returns a list with one entry per student who has a positive ENEM grade. Each entry has two elements: the first is the list [approval percentage, MC, ENEM grade] of that student, and the second is the student's postal code (CEP).
def points(students):
    """Build one feature vector per student for clustering.

    Parameters
    ----------
    students : dict
        Maps a student id to the non-empty list of that student's rows.
        Each row must support lookup by the keys 'enen-nota',
        'status.disciplina', 'nota', 'disciplina_ID' and 'CEP'.

    Returns
    -------
    list
        One ``[[aprv, mc, enem], cep]`` entry per student whose first row
        has an 'enen-nota' greater than 0, where

        * ``aprv`` is the fraction of the student's rows with status
          'Aprovado',
        * ``mc`` is the weighted average of the grades of those approved
          rows (0.0 when there are none),
        * ``enem`` is the 'enen-nota' of the student's first row,
        * ``cep`` is the 'CEP' of the student's first row.
    """
    result = []
    for rows in students.values():
        first = rows[0]
        # Take the ENEM grade from the row that is validated below, rather
        # than from whichever row a later loop happened to end on.
        enem = first['enen-nota']
        # Skip students without a positive ENEM grade; a NaN grade also
        # fails this comparison and is skipped.
        if not enem > 0:
            continue

        approved = [row for row in rows
                    if row['status.disciplina'] == 'Aprovado']
        grades = [row['nota'] for row in approved]
        weights = [get_weight(row['disciplina_ID']) for row in approved]

        mc = weighted_average(grades, weights) if grades else 0.0
        aprv = float(len(approved)) / len(rows)

        result.append([[aprv, mc, enem], first['CEP']])
    return result
Now we use KMeans, a method from sklearn.cluster, to group the students into clusters.
k-means clustering is a method of vector quantization, originally from signal processing, that is popular for cluster analysis in data mining. k-means clustering aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster. (source: Wikipedia)
Each student is represented by a point in Euclidean space (3D), whose coordinates are the approval percentage, the MC and the ENEM grade.
The number of clusters was fixed at 3.
n = 3  # number of clusterings

# points() gives one [[aprv, mc, enem], cep] entry per student; split the
# feature vectors from the postal codes, keeping both in the same order.
pts_ceps = points(students)
pts = [entry[0] for entry in pts_ceps]
ceps = [entry[1] for entry in pts_ceps]

# Fit k-means on the feature vectors; random_state=0 fixes the seed.
clusterings = KMeans(n_clusters=n, random_state=0).fit(pts)
from bokeh.models import ColumnDataSource, LayoutDOM
# Plot the clusters: ENEM grade against approval percentage (plot1) and
# ENEM grade against MC (plot2), one colour per cluster label.
plot1 = figure(x_axis_label='aprv (perc)', y_axis_label='enem')
plot2 = figure(x_axis_label='mc', y_axis_label='enem')
labels = clusterings.labels_

# One bucket per label: x = aprv, y = mc, z = enem, plus the postal codes.
x_axis = [[] for _ in range(n)]
y_axis = [[] for _ in range(n)]
z_axis = [[] for _ in range(n)]
ceps_by_label = [[] for _ in range(n)]

# Distribute every student into the bucket of its cluster label.
for point, cluster_id, postal_code in zip(pts, labels, ceps):
    x_axis[cluster_id].append(point[0])
    y_axis[cluster_id].append(point[1])
    z_axis[cluster_id].append(point[2])
    ceps_by_label[cluster_id].append(postal_code)

for i in range(n):
    legend_text = 'Label ' + str(i)
    plot1.circle(x_axis[i], z_axis[i], color=Set1[n][i], legend=legend_text)
    plot2.circle(y_axis[i], z_axis[i], color=Set1[n][i], legend=legend_text)

output_notebook()
show(plot1)
show(plot2)
# Print summary information about each cluster
def infos(x_axis, y_axis, z_axis):
    """Print the mean of each feature of one cluster.

    ``x_axis`` is reported as "aprv", ``y_axis`` as "MC" and ``z_axis`` as
    "enem". Raises ZeroDivisionError if any of the three lists is empty.
    """
    mean_aprv = sum(x_axis) / len(x_axis)
    mean_mc = sum(y_axis) / len(y_axis)
    mean_enem = sum(z_axis) / len(z_axis)
    report_lines = [
        " average of aprv: " + str(mean_aprv),
        " average of MC: " + str(mean_mc),
        " average of enem: " + str(mean_enem),
    ]
    print('\n'.join(report_lines))
def get_random(ceps, label):
    """Return a random 30% sample of ``ceps``, each paired with ``label``.

    The sample size is ``int(len(ceps) * 0.3)`` and positions are drawn
    without replacement. The result is a list of ``(cep, label)`` tuples.
    """
    sample_size = int(len(ceps) * 0.3)
    chosen_positions = random.sample(range(len(ceps)), sample_size)
    return [(ceps[pos], label) for pos in chosen_positions]
# For every cluster: print its size and feature means, then keep a random
# sample of its postal codes (tagged with the cluster label) for the map.
students_ceps = []
for i, (x, y, z, c) in enumerate(zip(x_axis, y_axis, z_axis, ceps_by_label)):
    print('Label ' + str(i) + ': \n size: ' + str(len(x)))
    infos(x, y, z)
    students_ceps.append(get_random(c, i))
In the previous step we used KMeans to split the students into different clusters; now we use the postal codes to see on a map how these groups are distributed.
# Report how many sampled students of each cluster will be placed on the map.
print('Just plot postal code location of some students (randomic selected)')
for i, s in enumerate(students_ceps):
    print('Label ' + str(i) + ': number of students used ' + str(len(s)))
# Build a map with one marker per sampled student, coloured by cluster label.
mapc = [0, 30]  # initial map centre as [latitude, longitude]
zoom = 2
colors = ["red", "blue", "green", "orange", "gray", "yellow"]
map_students = folium.Map(location=mapc, zoom_start=zoom)

# students_ceps contains n lists of tuples: first cep, second label of group
for cluster in students_ceps:
    for cep, label in cluster:
        # Skip postal codes that are not above 0.001; a NaN cep also fails
        # this comparison and is skipped.
        if not cep > 0.001:
            continue
        # NOTE(review): if cep is a float, str(cep) yields e.g. "59000000.0";
        # confirm that the geocoder resolves that form as intended.
        g = geocoder.google(str(cep))
        lat, lng = g.lat, g.lng
        # Only place a marker when the geocoder returned both coordinates.
        if lat is not None and lng is not None:
            folium.Marker(
                [lat, lng],
                icon=folium.Icon(color=colors[label], icon='info-sign'),
            ).add_to(map_students)
map_students
Analyzing superficially, it is possible to notice that there is a relation between academic indexes, enem grade and home location.