読者です 読者をやめる 読者になる 読者になる

Proxy分析2

自分用

フォーマット固定して速くした。



Proxy分析用

INPUTDATA

seq,send,count
1,45282,10
2,45326,54
3,45402,88
4,45470,1
5,45474,72
6,45597,26

OUTPUTDATA
偏差値CSV

# coding:utf-8
##入力フォーマットが決まっているのはあれ。


import pandas as pd
import numpy as np
import math
# Load the input table; the sample data has the columns seq, send, count.
ini = pd.read_csv("proxy_dataA.csv")

################################Variable definitions
# Column names used by the statistics functions to index the input table.
seq = "seq"
send = "send"
count = "count"
# NOTE(review): score and ml are not referenced anywhere in the visible code.
score = []
ml = []

################################統計関数定義
#★合計
def cus_sum(a):
	"""Return the count-weighted total of the ``send`` column.

	Each row's ``send`` value is multiplied by that row's ``count`` and the
	products are added up.

	Args:
		a: pandas DataFrame containing the columns named by the module-level
			``send`` and ``count`` variables.

	Returns:
		The sum of ``send * count`` over all rows (0 for an empty table).
	"""
	# Column-wise arithmetic replaces the per-row iterrows() loop, which
	# builds a Series object for every single row.
	return (a[send] * a[count]).sum()
#sum_c = cus_sum(ini)
#print "★合計\t:",sum_c

#★サンプル件数
def cus_sampling(a):
	"""Return the total number of samples, i.e. the sum of the ``count`` column.

	Args:
		a: pandas DataFrame containing the column named by the module-level
			``count`` variable.

	Returns:
		The sum of ``count`` over all rows (0 for an empty table).
	"""
	# A single column reduction instead of a Python-level loop over iterrows().
	return a[count].sum()
#sampling_c = cus_sampling(ini)
#print "★サンプル数\t:",sampling_c

#★平均
def cus_mean(a):
	"""Return the count-weighted mean of ``send``.

	The weighted total from ``cus_sum`` is converted to float and divided by
	the sample count from ``cus_sampling``.
	"""
	weighted_total = float(cus_sum(a))
	sample_count = cus_sampling(a)
	return weighted_total / sample_count
#mean_c = cus_mean(ini)
#print "★平均\t:",mean_c

#★分散
def cus_var(a):
	"""Return the count-weighted population variance of ``send``.

	Every squared deviation from the weighted mean is multiplied by the row's
	``count``; the total is divided by the number of samples (not by
	samples - 1).

	Args:
		a: pandas DataFrame containing the columns named by the module-level
			``send`` and ``count`` variables.

	Returns:
		The weighted variance.
	"""
	f_mean = cus_mean(a)
	# Vectorized over the whole column instead of one iterrows() step per row.
	weighted_sq_dev = ((a[send] - f_mean) ** 2) * a[count]
	return weighted_sq_dev.sum() / cus_sampling(a)
#var_c = cus_var(ini)
#print "★分散\t:",var_c

#★標準偏差
def cus_std(a):
	"""Return the standard deviation: the square root of ``cus_var(a)``."""
	variance = cus_var(a)
	return math.sqrt(variance)
#std_c = cus_std(ini)
#print "★標準偏差\t:",std_c

#★偏差値
def cus_std_score(a):
	"""Return the deviation score of every row's ``send`` value.

	The score of a value x is ``(x - mean) * 10 / std + 50``, where mean and
	std come from ``cus_mean`` and ``cus_std``.

	Args:
		a: pandas DataFrame containing the columns named by the module-level
			``send`` and ``count`` variables.

	Returns:
		A list with one score per row, in row order; an empty list when the
		table has no rows.
	"""
	# No rows means no scores; return before computing a mean over nothing.
	if len(a) == 0:
		return []
	# The mean and standard deviation are the same for every row, so compute
	# them once instead of re-scanning the whole table for each row.
	f_mean = cus_mean(a)
	f_std = cus_std(a)
	return ((a[send] - f_mean) * 10 / f_std + 50).tolist()
#std_score_c = cus_std_score(ini)
#print std_score_c

################################MAIN
# Run each statistic over the loaded table, print the result, and finally
# save the per-row deviation scores alongside the input columns.

# Weighted total: sum of send * count.
sum_c = cus_sum(ini)
print "★合計\t:",sum_c

# Number of samples: sum of count.
sampling_c = cus_sampling(ini)
print "★サンプル数\t:",sampling_c

# Weighted mean.
mean_c = cus_mean(ini)
print "★平均\t:",mean_c

# Weighted variance.
var_c = cus_var(ini)
print "★分散\t:",var_c

# Standard deviation.
std_c = cus_std(ini)
print "★標準偏差\t:",std_c

# Deviation score per row.
# NOTE(review): the "p" after "\n" in the label below looks like a stray character - confirm.
std_score_c = cus_std_score(ini)
print "★偏差値\np",std_score_c


# Append the scores as an extra column to the input table and write the result
# as tab-separated text with two decimal places.
np.savetxt("std_score.txt", np.c_[ini,std_score_c], delimiter="\t", fmt="%.2f")


################################[おまけ]ヒストグラム作成
def cus_histgram_data(f_std_score,divide):
	"""Bucket deviation scores into equal-width histogram bins.

	The right edge of the histogram is ``(int(max / divide) + 1) * divide``,
	a multiple of ``divide`` above the largest score. The bin width is that
	edge divided by ``divide``. Each score is mapped to the lower edge of the
	bin it falls into.

	Args:
		f_std_score: non-empty sequence of numeric scores.
		divide: number of divisions of the histogram range.

	Returns:
		A tuple ``(counts, keys)``. ``counts`` is a Counter mapping each bin's
		lower edge to the number of scores in it. ``keys`` is the list of bin
		lower edges, one per input score, in input order.

	Raises:
		ValueError: if ``f_std_score`` is empty (raised by ``max``).
	"""
	from collections import Counter
	f_range_max = (int(max(f_std_score) / divide) + 1) * divide  # right edge
	# f_range_max is a multiple of divide, so floor division is exact and
	# keeps the bin width an integer on both Python 2 and Python 3.
	f_scale = f_range_max // divide  # bin width
	# A single pre-joined string prints identically under Python 2 and 3.
	print(" ".join([
		"ヒストグラムデータ→",
		"目盛り:", str(f_scale),
		"右端:", str(f_range_max),
		"分割数:", str(divide),
	]))

	# Work directly on the scores passed in; no detour through the global
	# input table is needed. int() truncates toward zero.
	f_hist_key = [int(value / f_scale) * f_scale for value in f_std_score]
	f_hist = Counter(f_hist_key)
	print(f_hist)
	return f_hist, f_hist_key

#print cus_histgram_data(std_score_c,2)
# Division count passed to cus_histgram_data.
divide = 100000
hist_c,hist_key_c = cus_histgram_data(std_score_c,divide)
# Sort the (bin, count) pairs by bin so the rows of the output file come out
# in ascending bin order rather than in the Counter's iteration order.
hist_c = sorted(hist_c.items())


# One "bin<TAB>count" row per histogram bin.
np.savetxt("histgram_data.txt", hist_c, delimiter="\t", fmt="%.0f")




# Draw the histogram
import matplotlib.pyplot as plt
x =  np.array(hist_key_c)

plt.yscale("log")
plt.grid(which="both")
# Take the upper y-limit from the bar heights that hist() actually drew.
# The previous max(hist_c[1]) looked only at the second (bin, count) pair and
# failed when the histogram had a single bin.
bar_heights = plt.hist(x, label = "Send Byte", bins=100,alpha = 0.5, color = "red")[0]#range = (0,(int(max(std_score_c)/divide)+1)*divide)
plt.ylim([0.1, bar_heights.max()])
#plt.legend()
plt.show()

1億件(7分)
汚いグラフだなー…
f:id:gokaxtukei:20170212201538p:plain