支持向量机文本分类python_Python-基于向量机SVM的文本分类

支持向量机文本分类python_Python-基于向量机SVM的文本分类
1.算法介绍
2.代码所用数据
文件结构
├─doc_classification.py
├─vocabulary.txt
├─stopwords.txt
├─train.data
├─train.label
├─train.map
├─test.data
├─test.label
└─test.map
python代码
需要安装的库:
pandas, liblinearutil
注:Windows平台下 liblinearutil 安装包(32/64)
# doc_classification.py
import math
import time

import pandas as pd
from liblinearutil import *
# 读取数据
def loadOriginData(src='train'):
    """Load the raw bag-of-words data for one split ('train' or 'test').

    ``<src>.data`` holds one "docIdx wordIdx count" triple per line;
    ``<src>.label`` holds one integer class label per line, where the
    1-based line number is the document id.

    Args:
        src: split name used as the file-name prefix.

    Returns:
        TF:        {docIdx: {wordIdx: count}} raw term frequencies
        doc2term:  {docIdx: [wordIdx, ...]} terms appearing in each document
        term2doc:  {wordIdx: [docIdx, ...]} documents containing each term
        cate2docs: {label: [docIdx, ...]} documents belonging to each class
        label:     single-column DataFrame of per-document labels
    """
    # train.data: "docIdx wordIdx count" triples
    dataSrc = r'%s.data' % src
    # train.label: one class label per document
    labelSrc = r'%s.label' % src
    label = pd.read_table(labelSrc, sep=' ', names=['label'])
    # terms owned by each document
    doc2term = {}
    # documents containing each term
    term2doc = {}
    # documents belonging to each category
    cate2docs = {}
    # raw term-frequency counts
    TF = {}
    with open(dataSrc, 'r') as f:
        for line in f:
            str_docIdx, str_wordIdx, str_cnt = line.split()
            docIdx = int(str_docIdx)
            wordIdx = int(str_wordIdx)
            cnt = int(str_cnt)
            # update the index structures
            doc2term.setdefault(docIdx, []).append(wordIdx)
            term2doc.setdefault(wordIdx, []).append(docIdx)
            TF.setdefault(docIdx, {})[wordIdx] = cnt
    # collect the documents belonging to each category
    # (1-based line number == document id)
    with open(labelSrc, 'r') as f:
        for line_index, line in enumerate(f, 1):
            labelVal = int(line.strip())
            cate2docs.setdefault(labelVal, []).append(line_index)
    return TF, doc2term, term2doc, cate2docs, label
# 特征选择
def featureSel(doc2term, term2doc, cate2docs,
               stopwordsSrc='stopwords.txt', vocSrc='vocabulary.txt'):
    """Select discriminative terms with the chi-square (CHI) statistic.

    For every (term, category) pair the 2x2 contingency counts are:
        A: docs of the category that contain the term
        B: docs of other categories that contain the term
        C: docs of the category that do not contain the term
        D: docs of other categories that do not contain the term
    CHI = N * (A*D - C*B)^2 / ((A+C) * (B+D) * (A+B) * (C+D))

    Stop words are scored 0.  Within each category, terms whose CHI exceeds
    the category's mean CHI are kept as features.

    Args:
        doc2term:  {docIdx: [wordIdx, ...]}
        term2doc:  {wordIdx: [docIdx, ...]}
        cate2docs: {label: [docIdx, ...]}
        stopwordsSrc: path to the stop-word list, one word per line.
            NOTE(review): the path literals were truncated in the original
            source; these defaults are a reconstruction -- confirm the real
            file names.
        vocSrc: path to the vocabulary file; line i holds the word whose
            term id is i+1 (term ids are 1-based).

    Returns:
        features: set of selected wordIdx values
        DF:       {wordIdx: document frequency} for non-stop-word terms
    """
    # per-category CHI scores: {category: {term: CHI}}
    CHI_cat2term = {}
    # N: total number of documents
    N = len(doc2term)
    DF = {}
    # all category labels
    categories = list(cate2docs.keys())
    # stop-word lookup table
    stopwords = {}
    with open(stopwordsSrc) as f:
        for line in f:
            stopwords[line.strip()] = True
    # vocabulary of the training data (1-based term id -> word)
    voc = pd.read_table(vocSrc, names=['voc'])
    # all selected features
    features = set()
    # compute the CHI of every term under each category label
    for category in categories:
        # documents belonging to this category
        docs = cate2docs[category]
        sumVal = 0
        for term in term2doc:
            # stop words get a zero CHI score
            if stopwords.get(voc['voc'][term - 1], False):
                CHI_cat2term.setdefault(category, {})[term] = 0
                continue
            # in the category and containing the term
            AVal = len(set(term2doc[term]).intersection(set(docs)))
            # outside the category but containing the term
            BVal = len(term2doc[term]) - AVal
            # in the category but not containing the term
            CVal = len(docs) - AVal
            # outside the category and not containing the term
            DVal = N - AVal - BVal - CVal
            # guard: the denominator is zero e.g. when a term occurs in
            # every document (C+D == 0) or one category covers all docs
            denom = ((AVal + CVal) * (BVal + DVal)
                     * (AVal + BVal) * (CVal + DVal))
            CHIVal = N * (AVal * DVal - CVal * BVal) ** 2 / denom if denom else 0
            CHI_cat2term.setdefault(category, {})[term] = CHIVal
            # document frequency = number of docs containing the term
            DF[term] = AVal + BVal
            sumVal += CHIVal
        # keep the terms whose CHI is above the category average
        terms = CHI_cat2term[category]
        meanVal = sumVal / len(terms) if terms else 0
        for term in terms:
            if CHI_cat2term[category][term] > meanVal:
                features.add(term)
    print('There are %d features in VSM model.\n' % len(features))
    return features, DF
def buildSVMData(TF, DF, features, N, label, cate2docs, doc2terms):
    """Build liblinear-style sparse training data from tf-idf weights.

    Each document becomes a sparse {term: weight} dict holding the tf-idf
    of its feature terms, normalized by the document's largest tf-idf.

    Args:
        TF:        {docIdx: {wordIdx: count}} raw term frequencies
        DF:        {wordIdx: document frequency}
        features:  iterable of selected feature term ids
        N:         total number of documents
        label:     DataFrame whose first column is the per-document label
        cate2docs: {label: [docIdx, ...]}
        doc2terms: {docIdx: [wordIdx, ...]}

    Returns:
        x: list of N sparse {term: normalized tf-idf} dicts
        y: list of N labels (0 for documents absent from cate2docs)
    """
    # fast membership test for feature terms
    isFeatures = dict(zip(features, [True] * len(features)))
    categories = list(cate2docs.keys())
    # y: label of every document
    y = [0] * N
    # x: sparse feature matrix, one {term: weight} dict per document
    x = [{} for _ in range(N)]
    for category in categories:
        for doc in cate2docs[category]:
            # class label of this document (doc ids are 1-based)
            y[doc - 1] = label.iat[doc - 1, 0]
            # largest tf-idf seen in this document
            scale_factor = -100
            for term in doc2terms[doc]:
                if isFeatures.get(term, False):  # feature terms only
                    TFVal = TF[doc].get(term, 0)
                    tf_idf = TFVal * math.log(N / DF[term])
                    x[doc - 1][term] = tf_idf
                    # track the document's maximum feature weight
                    if scale_factor < tf_idf:
                        scale_factor = tf_idf
            # normalize every feature weight of the document by its largest
            # tf-idf; a non-positive maximum (no feature terms, or all-zero
            # weights) is skipped to avoid dividing by zero
            if scale_factor > 0:
                for term in doc2terms[doc]:
                    if isFeatures.get(term, False):
                        x[doc - 1][term] /= scale_factor
    print("Data for SVM has been built.\n")
    return x, y
# 计算DF
def getDF(doc2term, term2doc, cate2docs):
    """Compute the document frequency of every term.

    Only ``term2doc`` is consulted; ``doc2term`` and ``cate2docs`` are
    accepted for signature compatibility with the other helpers.

    Returns:
        {wordIdx: number of documents containing the word}
    """
    return {term: len(docs) for term, docs in term2doc.items()}
if __name__ == '__main__':
    start = time.time()
    # main driver -- NOTE(review): the original article is truncated here;
    # the SVM training/prediction steps (train(), predict() from
    # liblinearutil) are missing from the source
    TF, doc2term, term2doc, cate2docs, label = loadOriginData()
    # feature selection via CHI
    features, DF = featureSel(doc2term, term2doc, cate2docs)
    # re-read the data (train.data); NOTE(review): this repeats the first
    # load verbatim -- presumably the original continued with the test
    # split, but that code is not present in this copy
    TF, doc2term, term2doc, cate2docs, label = loadOriginData()

本文发布于:2024-09-24 09:15:44,感谢您对本站的认可!

本文链接:https://www.17tex.com/xueshu/570007.html

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。

标签:类别   数据   计算   包含   特征   向量   算法   需要
留言与评论(共有 0 条评论)
   
验证码:
Copyright ©2019-2024 Comsenz Inc.Powered by © 易纺专利技术学习网 豫ICP备2022007602号 豫公网安备41160202000603 站长QQ:729038198 关于我们 投诉建议