The overall objective of this toolkit is to provide a free collection of data analysis and machine learning examples that is specifically suited for doing data science. Its purpose is to get you started in a matter of minutes. You can run this collection either in a Jupyter notebook or with Python alone.
Features

Machine Learning
- Cross-Validation
- Evaluating Classification Metrics
- Evaluating Clustering Metrics
- Evaluating Regression Metrics
- Grid Search
- Preprocessing Encoding Categorical Features
- Preprocessing Binarization
- Preprocessing Imputing Missing Values
- Preprocessing Normalization
- Preprocessing Standard Scaler
- Randomized Parameter Optimization

Numpy
- Adding, Removing, and Splitting Arrays
- Sorting arrays
- Matrix object
- Statistics
- Vector Math
- Structured Arrays
- Import, Export, Slicing, Indexing
- Data to/from string

Pandas
- Complete pandas
- Groupby in Pandas
- Mapping
- Filtering
- Applying

Visualization
- Bar Plots
- Customization Matplotlib
- Working with Image
- Working with text

Naming Conventions

The naming convention I followed is: [yyyy-mm-dd-in-project-name-library].extension
- yyyy = stands for year
- mm = stands for month
- dd = stands for day
- in = my initials, for example: Saleban Olow = so
- library = numpy, pandas, sklearn, matplotlib
- project-name = each project name
- extension = .ipynb, .py, .html

Example: 2017-25-11-so-cross-validation-sklearn.ipynb

Code Samples:

Cross Validation
from sklearn.model_selection import cross_val_score
model = SVC(kernel='linear', C=1)
# let's try it using cv
scores = cross_val_score(model, X, y, cv=5)

Grid Search
from sklearn.model_selection import GridSearchCV
params = {"n_neighbors": np.arange(1, 5), "metric": ["euclidean", "cityblock"]}
grid = GridSearchCV(estimator=knn, param_grid=params)
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_estimator_.n_neighbors)

Preprocessing Imputing Missing Values
from sklearn.preprocessing import Imputer
impute = Imputer(missing_values=0, strategy='mean', axis=0)
impute.fit_transform(X_train)

Randomized Parameter Optimization
from sklearn.model_selection import RandomizedSearchCV
params = {"n_neighbors": range(1, 5), "weights": ["uniform", "distance"]}
rsearch = RandomizedSearchCV(estimator=knn, param_distributions=params, cv=4, n_iter=8, random_state=5)
rsearch.fit(X_train, y_train)
print(rsearch.best_score_)

Model fitting supervised and unsupervised learning
# supervised learning
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
# unsupervised learning
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)
pca_model = pca.fit_transform(X_train)

Working with numpy arrays
import numpy as np
# appends values to end of arr
np.append(arr, values)
# inserts values into arr before index 2
np.insert(arr, 2, values)

Indexing and Slicing arrays
import numpy as np
# return the element at index 5
arr = np.array([[1, 2, 3, 4, 5, 6, 7]])
arr[5]
# returns the 2D array element on index
arr[2, 5]
# assign array element on index 1 the value 4
arr[1] = 4
# assign array element on index [1][3] the value 10
arr[1, 3] = 10

Creating DataFrame
import pandas as pd
# specify values for each row and column
df = pd.DataFrame(
    [[4, 7, 10], [5, 8, 11], [6, 9, 12]],
    index=[1, 2, 3],
    columns=['a', 'b', 'c'])

groupby pandas
import pandas as pd
# return a groupby object, grouped by values in column named 'Cities'
df.groupby(by="Cities")

handling missing values
import pandas as pd
# drop rows with any column having NA/null data.
df.dropna()
# replace all NA/null data with value
df.fillna(value)

Melt function
import pandas as pd
# most pandas methods return a DataFrame, so that another pandas method
# can be applied to the result; this improves readability of code
df = (pd.melt(df)
        .rename(columns={'old_name': 'new_name', 'old_name': 'new_name'})
        .query('new_name >= 200'))

Save plot
import matplotlib.pyplot as plt
# saves plot/figure to image
plt.savefig('pic_name.png')

Marker, lines
import matplotlib.pyplot as plt
# add * for every data point
plt.plot(x, y, marker='*')
# adds dot for every data point
plt.plot(x, y, marker='.')

Figures, Axis
import matplotlib.pyplot as plt
# a container that contains all plot elements
fig = plt.figure()
# Initializes subplot
fig.add_axes()
# A subplot is an axes on a grid system, rows-cols-num
a = fig.add_subplot(222)
# adds subplot
fig, b = plt.subplots(nrows=3, ncols=2)
# creates subplot
ax = plt.subplots(2, 2)

Working with text plot
import matplotlib.pyplot as plt
# places text at coordinates 1/1
plt.text(1, 1, 'Example text', style='italic')
# annotate the point with coordinates xy with text
ax.annotate('some annotation', xy=(10, 10))
# just put math formula
plt.title(r'$\delta_i=20$', fontsize=10)
评论