The goal of my Capstone was to find a computational model that classifies the severity of genetic mutations using the gene the mutation occurs on, the kind of variation, and the clinical evidence describing the mutation.
Data Obtained From a Kaggle.com Research Prediction Competition Included:
The original data came in two files:
- Variable data on the gene, variation, and class for each mutation ID
- Text data that corresponds to each mutation ID
Step one was using the pandas package to import the original variable data and text data into a Jupyter Notebook.
Variable Data Importation Code
import pandas as pd
#import the variable train data
trainVariants=pd.read_csv("training_variants.csv")
#display trainVariants
trainVariants.head()
 | ID | Gene | Variation | Class |
---|---|---|---|---|
0 | 0 | FAM58A | Truncating Mutations | 1 |
1 | 1 | CBL | W802* | 2 |
2 | 2 | CBL | Q249E | 2 |
3 | 3 | CBL | N454D | 3 |
4 | 4 | CBL | L399V | 4 |
Text Data Importation Code
#import the train text data file
trainText=pd.read_csv("training_text.txt", engine="python", sep=r'\|\|', skiprows=1, names=["ID", "Text"])
#display trainText
trainText
 | ID | Text |
---|---|---|
0 | 0 | Cyclin-dependent kinases (CDKs) regulate a var... |
1 | 1 | Abstract Background Non-small cell lung canc... |
2 | 2 | Abstract Background Non-small cell lung canc... |
3 | 3 | Recent evidence has demonstrated that acquired... |
4 | 4 | Oncogenic mutations in the monomeric Casitas B... |
... | ... | ... |
3316 | 3316 | Introduction Myelodysplastic syndromes (MDS) ... |
3317 | 3317 | Introduction Myelodysplastic syndromes (MDS) ... |
3318 | 3318 | The Runt-related transcription factor 1 gene (... |
3319 | 3319 | The RUNX1/AML1 gene is the most frequent targe... |
3320 | 3320 | The most frequent mutations associated with le... |
3321 rows × 2 columns
Next, the text data was cleaned, a step necessary for later encoding. The cleaning function below lowercases each cell, converts special characters and repeated whitespace to single spaces, and removes stop words. The function was then applied to every text cell.
Function to clean each cell of text:
import re
from nltk.corpus import stopwords

#create a function that takes a cell string in the dataframe and cleans that cell (lowercase, no special characters or extra spacing)
def clean(cellString):
    #convert to lower case
    cellString= cellString.lower()
    #convert any special characters to a space
    cellString = re.sub(r'[^a-zA-Z0-9\n\.]', ' ', cellString)
    #convert any run of whitespace to a single space
    cellString=re.sub(r'\s+', ' ', cellString)
    #get rid of stop words (aka frequently occurring words)
    stop_words= set(stopwords.words('english'))
    cleanedSentence=""
    for word in cellString.split():
        if word not in stop_words:
            cleanedSentence= cleanedSentence+ str(word)+ " "
    return cleanedSentence
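For example, on a made-up sentence (this assumes NLTK's English stop-word list has already been downloaded with nltk.download('stopwords')):
#stop words like "the" and "is" are dropped, special characters become spaces, and text is lowercased
print(clean("The CDK-4 gene is mutated!"))
#output: 'cdk 4 gene mutated '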
Applied Function to Every Cell
%%capture output
#run the function on each row of text data
i=0
while i<len(trainText):
    #get the cell value as a string
    value=str(trainText["Text"][i])
    #if the string is not empty, clean it and write it back
    if len(value)!=0:
        value=clean(value)
        trainText.loc[i, "Text"]=value
    #update i
    i=i+1
Final Cleaned Text Data
trainText
 | ID | Text |
---|---|---|
0 | 0 | cyclin dependent kinases cdks regulate variety... |
1 | 1 | abstract background non small cell lung cancer... |
2 | 2 | abstract background non small cell lung cancer... |
3 | 3 | recent evidence demonstrated acquired uniparen... |
4 | 4 | oncogenic mutations monomeric casitas b lineag... |
... | ... | ... |
3316 | 3316 | introduction myelodysplastic syndromes mds het... |
3317 | 3317 | introduction myelodysplastic syndromes mds het... |
3318 | 3318 | runt related transcription factor 1 gene runx1... |
3319 | 3319 | runx1 aml1 gene frequent target chromosomal tr... |
3320 | 3320 | frequent mutations associated leukemia recurre... |
3321 rows × 2 columns
The next step was merging the variable data and the cleaned text data on their corresponding IDs, which allows all of the features to be processed together.
Merging Data Code
#merge the trainVariant and cleaned trainText data
data=pd.merge(trainVariants, trainText, how="outer", on=["ID"])
#display the merged data
data.head()
 | ID | Gene | Variation | Class | Text |
---|---|---|---|---|---|
0 | 0 | FAM58A | Truncating Mutations | 1 | cyclin dependent kinases cdks regulate variety... |
1 | 1 | CBL | W802* | 2 | abstract background non small cell lung cancer... |
2 | 2 | CBL | Q249E | 2 | abstract background non small cell lung cancer... |
3 | 3 | CBL | N454D | 3 | recent evidence demonstrated acquired uniparen... |
4 | 4 | CBL | L399V | 4 | oncogenic mutations monomeric casitas b lineag... |
After the files were merged, the categorical data had to be encoded into numerical data using scikit-learn's LabelEncoder. This is necessary because these models can only process numerical inputs.
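As a quick illustration of what LabelEncoder does (the gene names here are just a toy example, not drawn from the dataset):
from sklearn.preprocessing import LabelEncoder
#LabelEncoder assigns each unique string an integer code (sorted alphabetically)
demo=LabelEncoder()
print(demo.fit_transform(["CBL", "FAM58A", "CBL", "RUNX1"]))
#output: [0 1 0 2]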
#encode our data to numerical values so we are able to run models on it
from sklearn.preprocessing import LabelEncoder
#encode gene, variation, and text
geneEncoder=LabelEncoder()
variationEncoder= LabelEncoder()
textEncoder=LabelEncoder()
#create a column for each of these new encodings in the data
data["geneEn"]=geneEncoder.fit_transform(data["Gene"])
data['variationEn']=variationEncoder.fit_transform(data['Variation'])
data['textEn']=textEncoder.fit_transform(data["Text"])
Merged and Encoded Data
data
 | ID | Gene | Variation | Class | Text | geneEn | variationEn | textEn |
---|---|---|---|---|---|---|---|---|
0 | 0 | FAM58A | Truncating Mutations | 1 | cyclin dependent kinases cdks regulate variety... | 85 | 2629 | 532 |
1 | 1 | CBL | W802* | 2 | abstract background non small cell lung cancer... | 39 | 2856 | 36 |
2 | 2 | CBL | Q249E | 2 | abstract background non small cell lung cancer... | 39 | 1897 | 36 |
3 | 3 | CBL | N454D | 3 | recent evidence demonstrated acquired uniparen... | 39 | 1667 | 1557 |
4 | 4 | CBL | L399V | 4 | oncogenic mutations monomeric casitas b lineag... | 39 | 1447 | 1322 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
3316 | 3316 | RUNX1 | D171N | 4 | introduction myelodysplastic syndromes mds het... | 221 | 306 | 970 |
3317 | 3317 | RUNX1 | A122* | 1 | introduction myelodysplastic syndromes mds het... | 221 | 28 | 968 |
3318 | 3318 | RUNX1 | Fusions | 1 | runt related transcription factor 1 gene runx1... | 221 | 807 | 1642 |
3319 | 3319 | RUNX1 | R80C | 4 | runx1 aml1 gene frequent target chromosomal tr... | 221 | 2249 | 1646 |
3320 | 3320 | RUNX1 | K83E | 4 | frequent mutations associated leukemia recurre... | 221 | 1333 | 702 |
3321 rows × 8 columns
Data Splits:
- Train: allow the model to run its algorithm on a random 80% of our data
- Test: apply this same model to the remaining 20% of the data and check its performance
Input and Output Code
The input does not include Class, which is the desired output of the model.
#drop Class and non-encoded inputs
Input=data.drop(["Gene", "Variation", "Text", "Class"], axis='columns')
#just numerical class data
Output=data["Class"]
display(Input.head())
display(Output.head())
 | ID | geneEn | variationEn | textEn |
---|---|---|---|---|
0 | 0 | 85 | 2629 | 532 |
1 | 1 | 39 | 2856 | 36 |
2 | 2 | 39 | 1897 | 36 |
3 | 3 | 39 | 1667 | 1557 |
4 | 4 | 39 | 1447 | 1322 |
0    1
1    2
2    2
3    3
4    4
Name: Class, dtype: int64
Further Split Input and Output into Train and Test Sets
#split our data into train and test sets
from sklearn.model_selection import train_test_split
inputTrain, inputTest, outputTrain, outputTest= train_test_split(Input, Output, test_size=.2)
display(inputTrain.head())
display(outputTrain.head())
 | ID | geneEn | variationEn | textEn |
---|---|---|---|---|
542 | 542 | 230 | 1256 | 1818 |
387 | 387 | 252 | 2041 | 1773 |
1617 | 1617 | 258 | 2809 | 643 |
2432 | 2432 | 31 | 1386 | 44 |
2087 | 2087 | 2 | 2202 | 325 |
542     1
387     1
1617    4
2432    1
2087    1
Name: Class, dtype: int64
Before applying any models, we first analyze our data to see if there are any noticeable patterns.
Basic Information:
We mainly look at the encoded data.
dataEncoding=data.drop(["ID","Gene", "Variation", "Text"], axis='columns')
dataEncoding
 | Class | geneEn | variationEn | textEn |
---|---|---|---|---|
0 | 1 | 85 | 2629 | 532 |
1 | 2 | 39 | 2856 | 36 |
2 | 2 | 39 | 1897 | 36 |
3 | 3 | 39 | 1667 | 1557 |
4 | 4 | 39 | 1447 | 1322 |
... | ... | ... | ... | ... |
3316 | 4 | 221 | 306 | 970 |
3317 | 1 | 221 | 28 | 968 |
3318 | 1 | 221 | 807 | 1642 |
3319 | 4 | 221 | 2249 | 1646 |
3320 | 4 | 221 | 1333 | 702 |
3321 rows × 4 columns
Bar Chart
The following chart shows the count of the different classes of genetic mutations.
import matplotlib.pyplot as plt
#derive the class counts (an assumed definition; classes and counts were not defined in the original snippet)
counts=dataEncoding["Class"].value_counts().sort_index()
barchart= plt.bar(counts.index, counts.values)
plt.title('Count of Classes')
plt.xlabel("Class")
plt.ylabel("Count")
Text(0, 0.5, 'Count')
Heat Map
The following map shows the correlation between each pair of variables. Since correlation measures the strength of the linear relationship between two variables, and the off-diagonal correlations here are weak, we can conclude there is little to no linear relationship between any two variables.
import seaborn as sns
#compute pairwise correlations and plot them as a heat map
correlation= dataEncoding.corr()
sns.heatmap(correlation, cmap='PuOr')
<AxesSubplot:>
Pivot Table
The following table displays different relationships between the variables.
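The original pivot table output is not reproduced here; below is a minimal sketch of how such a table could be built with pandas. The choice of index and aggregation is an assumption, not the exact table from the capstone.
#group the encoded features by class and summarize them (aggregation choice is an assumption)
pivot=data.pivot_table(index="Class", values=["geneEn", "variationEn", "textEn"], aggfunc="mean")
print(pivot)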
Decision Tree Model
The model splits the data into nodes and subnodes based on similarities in gene, variation, and text in order to predict the class of the genetic mutation.
The Decision Tree model for this data yields approximately 52% accuracy in predicting the class (1-9) of genetic mutation.
Decision Tree Model Code:
#MODEL 1 IS THE DECISION TREE MODEL
from sklearn.tree import DecisionTreeClassifier
model1= DecisionTreeClassifier()
#have the model fit our train data
model1.fit(inputTrain, outputTrain)
#have the model make predictions based off our test input
predictions1=model1.predict(inputTest)
#see how accurate our predictions of test are by comparing with the test output
from sklearn.metrics import accuracy_score
score1= accuracy_score(outputTest, predictions1) #relationship between predictions and outputTest
#display the score
score1
0.5218045112781955
SVD takes our data frame matrix and factors it into 3 smaller matrices (U, Sigma, V^T) that, when multiplied together, reconstruct the original matrix. These matrices are built from the data's latent features and the strengths of those latent features. SVD is useful for dimensionality reduction.
https://towardsdatascience.com/understanding-singular-value-decomposition-and-its-application-in-data-science-388a54be95d
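As a quick illustration of the decomposition itself (a NumPy toy example, separate from the Surprise model used below):
import numpy as np
#factor a small matrix into U, Sigma, V^T
A=np.array([[3.0, 1.0], [1.0, 3.0], [0.0, 2.0]])
U, sigma, VT=np.linalg.svd(A, full_matrices=False)
#multiplying the three factors back together recovers the original matrix
print(np.allclose(A, U @ np.diag(sigma) @ VT))
#output: True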
SVD Model Code
from surprise import SVD, Dataset, Reader
#use Surprise's own train/test split so sklearn's version is not overwritten
from surprise.model_selection import train_test_split as surpriseSplit
#create a reader with the range of classes 1-9
reader= Reader(rating_scale= (1,9))
#Surprise expects exactly three columns, with the class (rating) listed last
svdData=dataEncoding[["geneEn", 'variationEn', 'Class']]
#create the data for the model using the reader
svd=Dataset.load_from_df(svdData, reader)
#split the data into train and test sets
svdTrain, svdTest= surpriseSplit(svd, test_size=.2)
#create SVD model with 50 latent features
model= SVD(n_factors=50)
#fit the model to the svd training data
model.fit(svdTrain)
<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f99d25cea00>
SVD Code Continued
#generate predictions for the svd test set
testing=model.test(svdTest)
#determine the length of the test set
length= len(svdTest)
#create an array of the test set class answers
answers=[]
i=0
while i<=length-1:
    answers.append(svdTest[i][2])
    i=i+1
#create an array of the class predictions from the svd model
predictions=[]
i=0
while i<=length-1:
    #round each estimate (index 3 of a Prediction) to the nearest class
    predictions.append(round(testing[i][3]))
    i=i+1
The SVD Model for this data yields a 10-20% accuracy rating on predicting the class of genetic mutation.
SVD works best when there are more columns and fewer rows, where there is more to gain from dimensionality reduction. Since this data has many rows and few columns, this could be the reason for the model's low success.
#determine the accuracy between the class answers and class predictions from the svd model
score=accuracy_score(predictions, answers)
print(score)
0.20150375939849624
Random Forest Model
Random Forest is similar to the Decision Tree model, but it builds multiple decision trees and then merges their predictions together.
Random Forest yields approximately a 63% accuracy rating on predicting the class of genetic mutation.
Random Forest Model Code:
#use the inputTrain, outputTrain, inputTest, and outputTest data from model 1
from sklearn.ensemble import RandomForestClassifier
model3=RandomForestClassifier(n_estimators=100)
#have the model fit the train data
model3.fit(inputTrain, outputTrain)
#have the model make predictions based off our test inputs
predictions3= model3.predict(inputTest)
#determine the accuracy of the predictions
score3= accuracy_score(outputTest, predictions3)
#print the score
score3
0.6345864661654136
Logistic Regression Model
Logistic Regression is a statistical model that relates the inputs to the class by modeling the log-odds of each class as a linear function of the inputs.
Logistic Regression yields approximately 35% accuracy in predicting the class (1-9) of genetic mutations.
Logistic Regression Model Code:
from sklearn.linear_model import LogisticRegression
#create and fit the logistic regression model
model4=LogisticRegression(solver='liblinear', random_state=0)
model4.fit(inputTrain, outputTrain)
#inspect the fitted intercept and coefficients
intercept= model4.intercept_
b=model4.coef_
#have the model make predictions based off our test inputs
predictions4= model4.predict(inputTest)
score4=accuracy_score(predictions4, outputTest)
score4
0.35037593984962406
K Nearest Neighbor Model
K Nearest Neighbor predicts the class of a point by looking at the class of its nearest "neighbor," the closest data point in the input space.
K Nearest Neighbor yields approximately 47% accuracy in predicting the class (1-9) of genetic mutation.
K Nearest Neighbor Model Code:
# k nearest neighbor
from sklearn.neighbors import KNeighborsClassifier
#classify each test point by the class of its single nearest training point
model5= KNeighborsClassifier(n_neighbors=1)
model5.fit(inputTrain, outputTrain)
predictions5= model5.predict(inputTest)
score5=accuracy_score(predictions5, outputTest)
score5
0.47218045112781953
Bagheri, Reza. (2020, January 9). Understanding Singular Value Decomposition and Its Application in Data Science. Towards Data Science. https://towardsdatascience.com/understanding-singular-value-decomposition-and-its-application-in-data-science-388a54be95d
Kaggle. (2017). Personalized Medicine: Redefining Cancer Treatment. https://www.kaggle.com/c/msk-redefining-cancer-treatment
Logistic Regression for Image Classification. (2020, September 3). https://scipython.com/blog/logistic-regression-for-image-classification/
Sharma, Abhishek. (2020, May 12). Decision Tree vs. Random Forest: Which Algorithm Should You Use? Analytics Vidhya. https://www.analyticsvidhya.com/blog/2020/05/decision-tree-vs-random-forest-algorithm/
Sottoriva, Andrea. (n.d.). Computational Biology. Human Technopole. https://humantechnopole.it/en/research-centres/computational-biology/
Srivastava, Tavish. (2018, March 26). Introduction to k-Nearest Neighbors: A Powerful Machine Learning Algorithm. Analytics Vidhya. https://www.analyticsvidhya.com/blog/2018/03/introduction-k-neighbours-algorithm-clustering/