MLの流れをもう少し詳しく把握してみよう。
test_model.take(1)
で返されたmodelの構造を良く見てみる。
[Row(
INFANT_ALIVE_AT_REPORT=0,
BIRTH_PLACE='1',
0) MOTHER_AGE_YEARS=13,
1) FATHER_COMBINED_AGE=99,
CIG_BEFORE=0,
CIG_1_TRI=0,
CIG_2_TRI=0,
CIG_3_TRI=0,
6) MOTHER_HEIGHT_IN=66,
7) MOTHER_PRE_WEIGHT=133,
8) MOTHER_DELIVERY_WEIGHT=135,
9) MOTHER_WEIGHT_GAIN=2,
DIABETES_PRE=0,
DIABETES_GEST=0,
HYP_TENS_PRE=0,
HYP_TENS_GEST=0,
PREV_BIRTH_PRETERM=0,
BIRTH_PLACE_INT=1,
16) BIRTH_PLACE_VEC=SparseVector(9, {1: 1.0}),
features=SparseVector(24, {0: 13.0, 1: 99.0, 6: 66.0, 7: 133.0, 8: 135.0, 9: 2.0, 16: 1.0}),
rawPrediction=DenseVector([1.0545, -1.0545]), probability=DenseVector([0.7416, 0.2584]),
prediction=0.0
)]
import pyspark.ml.feature as ft

featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
    outputCol='features'
)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
encoder = ft.OneHotEncoder( inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC') feature_vectors = encoder.transform(births) feature_vectors.show(truncate=False) +----------------------+-----------+----------------+-------------------+----------+---------+---------+---------+----------------+-----------------+----------------------+------------------+------------+-------------+------------+-------------+------------------+---------------+---------------+ |INFANT_ALIVE_AT_REPORT|BIRTH_PLACE|MOTHER_AGE_YEARS|FATHER_COMBINED_AGE|CIG_BEFORE|CIG_1_TRI|CIG_2_TRI|CIG_3_TRI|MOTHER_HEIGHT_IN|MOTHER_PRE_WEIGHT|MOTHER_DELIVERY_WEIGHT|MOTHER_WEIGHT_GAIN|DIABETES_PRE|DIABETES_GEST|HYP_TENS_PRE|HYP_TENS_GEST|PREV_BIRTH_PRETERM|BIRTH_PLACE_INT|BIRTH_PLACE_VEC| +----------------------+-----------+----------------+-------------------+----------+---------+---------+---------+----------------+-----------------+----------------------+------------------+------------+-------------+------------+-------------+------------------+---------------+---------------+ |0 |1 |29 |99 |0 |0 |0 |0 |99 |999 |999 |99 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | |0 |1 |22 |29 |0 |0 |0 |0 |65 |180 |198 |18 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | |0 |1 |38 |40 |0 |0 |0 |0 |63 |155 |167 |12 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | |0 |1 |39 |42 |0 |0 |0 |0 |60 |128 |152 |24 |0 |0 |0 |0 |1 |1 |(9,[1],[1.0]) | |0 |1 |18 |99 |6 |4 |2 |2 |61 |110 |130 |20 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | |0 |1 |32 |37 |0 |0 |0 |0 |66 |150 |162 |12 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | |0 |1 |22 |25 |0 |0 |0 |0 |68 |155 |191 |36 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | |0 |1 |25 |26 |0 |0 |0 |0 |64 |136 |169 |33 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | |0 |1 |26 |32 |0 |0 |0 |0 |64 |140 |147 |7 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | |0 |1 |39 |66 |0 |0 |0 |0 |65 |140 |150 |10 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | |0 |1 |25 |22 |0 |0 |0 |0 |62 |145 |152 |7 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | |0 |1 |33 |99 |0 |0 |0 |0 |65 |145 |145 |0 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) 
| |0 |1 |25 |99 |0 |0 |0 |0 |64 |140 |165 |25 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | |0 |1 |29 |99 |0 |0 |0 |0 |60 |115 |120 |5 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | |0 |1 |28 |29 |0 |0 |0 |0 |66 |320 |318 |0 |0 |0 |1 |0 |1 |1 |(9,[1],[1.0]) | |0 |1 |23 |28 |0 |0 |0 |0 |64 |120 |141 |21 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | |0 |1 |31 |41 |0 |0 |0 |0 |59 |106 |142 |36 |0 |0 |0 |0 |1 |1 |(9,[1],[1.0]) | |0 |1 |27 |99 |0 |0 |0 |0 |66 |213 |200 |0 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | |0 |1 |28 |27 |0 |0 |0 |0 |66 |165 |224 |59 |0 |0 |0 |0 |1 |1 |(9,[1],[1.0]) | |0 |1 |34 |31 |0 |0 |0 |0 |70 |130 |134 |4 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) | +----------------------+-----------+----------------+-------------------+----------+---------+---------+---------+----------------+-----------------+----------------------+------------------+------------+-------------+------------+-------------+------------------+---------------+---------------+ only showing top 20 rows |
|INFANT_ALIVE_AT_REPORT: 0
|BIRTH_PLACE: 1
|MOTHER_AGE_YEARS: 29
|FATHER_COMBINED_AGE: 99
|CIG_BEFORE: 0
|CIG_1_TRI: 0
|CIG_2_TRI: 0
|CIG_3_TRI: 0
|MOTHER_HEIGHT_IN: 99
|MOTHER_PRE_WEIGHT: 999
|MOTHER_DELIVERY_WEIGHT: 999
|MOTHER_WEIGHT_GAIN: 99
|DIABETES_PRE: 0
|DIABETES_GEST: 0
|HYP_TENS_PRE: 0
|HYP_TENS_GEST: 0
|PREV_BIRTH_PRETERM: 0
|BIRTH_PLACE_INT: 1
|BIRTH_PLACE_VEC: (9,[1],[1.0])
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
featuresCreator = ft.VectorAssembler( inputCols=[ col[0] for col in labels[2:]] + \ [encoder.getOutputCol()], outputCol='features' ) feature_vectors_2 = featuresCreator.transform(feature_vectors) feature_vectors_2.show(truncate=False) +----------------------+-----------+----------------+-------------------+----------+---------+---------+---------+----------------+-----------------+----------------------+------------------+------------+-------------+------------+-------------+------------------+---------------+---------------+------------------------------------------------+ |INFANT_ALIVE_AT_REPORT|BIRTH_PLACE|MOTHER_AGE_YEARS|FATHER_COMBINED_AGE|CIG_BEFORE|CIG_1_TRI|CIG_2_TRI|CIG_3_TRI|MOTHER_HEIGHT_IN|MOTHER_PRE_WEIGHT|MOTHER_DELIVERY_WEIGHT|MOTHER_WEIGHT_GAIN|DIABETES_PRE|DIABETES_GEST|HYP_TENS_PRE|HYP_TENS_GEST|PREV_BIRTH_PRETERM|BIRTH_PLACE_INT|BIRTH_PLACE_VEC|features | +----------------------+-----------+----------------+-------------------+----------+---------+---------+---------+----------------+-----------------+----------------------+------------------+------------+-------------+------------+-------------+------------------+---------------+---------------+------------------------------------------------+ |0 |1 |29 |99 |0 |0 |0 |0 |99 |999 |999 |99 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[99.0,999.0]) | |0 |1 |22 |29 |0 |0 |0 |0 |65 |180 |198 |18 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[65.0,180.0]) | |0 |1 |38 |40 |0 |0 |0 |0 |63 |155 |167 |12 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[63.0,155.0]) | |0 |1 |39 |42 |0 |0 |0 |0 |60 |128 |152 |24 |0 |0 |0 |0 |1 |1 |(9,[1],[1.0]) |(11,[0,1,6],[60.0,128.0,1.0]) | |0 |1 |18 |99 |6 |4 |2 |2 |61 |110 |130 |20 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1,7,8,9,10],[61.0,110.0,6.0,4.0,2.0,2.0])| |0 |1 |32 |37 |0 |0 |0 |0 |66 |150 |162 |12 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[66.0,150.0]) | |0 |1 |22 |25 |0 |0 |0 |0 |68 |155 |191 |36 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[68.0,155.0]) | |0 |1 
|25 |26 |0 |0 |0 |0 |64 |136 |169 |33 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[64.0,136.0]) | |0 |1 |26 |32 |0 |0 |0 |0 |64 |140 |147 |7 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[64.0,140.0]) | |0 |1 |39 |66 |0 |0 |0 |0 |65 |140 |150 |10 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[65.0,140.0]) | |0 |1 |25 |22 |0 |0 |0 |0 |62 |145 |152 |7 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[62.0,145.0]) | |0 |1 |33 |99 |0 |0 |0 |0 |65 |145 |145 |0 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[65.0,145.0]) | |0 |1 |25 |99 |0 |0 |0 |0 |64 |140 |165 |25 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[64.0,140.0]) | |0 |1 |29 |99 |0 |0 |0 |0 |60 |115 |120 |5 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[60.0,115.0]) | |0 |1 |28 |29 |0 |0 |0 |0 |66 |320 |318 |0 |0 |0 |1 |0 |1 |1 |(9,[1],[1.0]) |(11,[0,1,4,6],[66.0,320.0,1.0,1.0]) | |0 |1 |23 |28 |0 |0 |0 |0 |64 |120 |141 |21 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[64.0,120.0]) | |0 |1 |31 |41 |0 |0 |0 |0 |59 |106 |142 |36 |0 |0 |0 |0 |1 |1 |(9,[1],[1.0]) |(11,[0,1,6],[59.0,106.0,1.0]) | |0 |1 |27 |99 |0 |0 |0 |0 |66 |213 |200 |0 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[66.0,213.0]) | |0 |1 |28 |27 |0 |0 |0 |0 |66 |165 |224 |59 |0 |0 |0 |0 |1 |1 |(9,[1],[1.0]) |(11,[0,1,6],[66.0,165.0,1.0]) | |0 |1 |34 |31 |0 |0 |0 |0 |70 |130 |134 |4 |0 |0 |0 |0 |0 |1 |(9,[1],[1.0]) |(11,[0,1],[70.0,130.0]) | +----------------------+-----------+----------------+-------------------+----------+---------+---------+---------+----------------+-----------------+----------------------+------------------+------------+-------------+------------+-------------+------------------+---------------+---------------+------------------------------------------------+ only showing top 20 rows BIRTH_PLACE_VEC|features | (9,[1],[1.0]) |(11,[0,1],[99.0,999.0]) |