# NLP Fundamentals: Building a Text Classifier from Scratch

Natural language processing (NLP) is one of the most challenging and practical branches of artificial intelligence. It enables computers to understand, interpret, and generate human language. This article walks through the core concepts of NLP from scratch and builds a complete text classifier along the way, so you come away with practical, hands-on skills.

## 1. NLP Fundamentals

### 1.1 What Is Natural Language Processing?

Natural language processing sits at the intersection of computer science, artificial intelligence, and linguistics. Its goal is to let computers "understand" human language, where "understanding" includes:

- Recognizing entities, relations, and sentiment in text
- Extracting key information
- Generating natural-language responses
- Translating between languages

### 1.2 Core NLP Tasks

1. Text classification: assigning a piece of text to one of a set of predefined categories
2. Named entity recognition: identifying person names, place names, organizations, and so on
3. Sentiment analysis: determining the emotional polarity of a text
4. Machine translation: translating text from one language into another
5. Question answering: returning an accurate answer to a question
6. Text generation: automatically producing coherent text

## 🚀 2. The NLP Processing Pipeline

### 2.1 Text Preprocessing

Text preprocessing is the first step of any NLP pipeline. Its goal is to turn raw text into a format that machine learning models can work with.

```python
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    """Clean and normalize a raw text string."""
    # 1. Lowercase
    text = text.lower()

    # 2. Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 3. Tokenize
    tokens = word_tokenize(text)

    # 4. Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 5. Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

# Example
sample_text = "Natural Language Processing is amazing! It helps computers understand human language."
processed_text = preprocess_text(sample_text)
print(f"Original:  {sample_text}")
print(f"Processed: {processed_text}")
```

### 2.2 Text Representation Methods

Computers cannot work with raw text directly; it first has to be converted into a numerical representation.

#### 2.2.1 Bag of Words

```python
from sklearn.feature_extraction.text import CountVectorizer

# Example documents
documents = [
    "I love natural language processing",
    "Natural language processing is interesting",
    "I enjoy learning about AI and NLP"
]

# Build the bag-of-words representation
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

print("Vocabulary:", vectorizer.get_feature_names_out())
print("Bag-of-words matrix:")
print(X.toarray())
```

#### 2.2.2 TF-IDF (Term Frequency-Inverse Document Frequency)

```python
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF representation for the same documents
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(documents)

print("TF-IDF matrix:")
print(X_tfidf.toarray())
print("Feature names:", tfidf_vectorizer.get_feature_names_out())
```
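The intuition is that a term gets a high weight when it is frequent within a document but rare across the corpus. As a minimal sketch (assuming scikit-learn's default smoothed formulation, `idf(t) = ln((1 + n) / (1 + df(t))) + 1`, with each row then L2-normalized), the rare word "enjoy" receives a larger IDF than the common word "processing":

```python
import numpy as np

n_docs = len(documents)   # 3 documents in the toy corpus above
df_processing = 2         # "processing" occurs in 2 of the 3 documents
df_enjoy = 1              # "enjoy" occurs in only 1 document

idf_processing = np.log((1 + n_docs) / (1 + df_processing)) + 1
idf_enjoy = np.log((1 + n_docs) / (1 + df_enjoy)) + 1

print(f"idf('processing') = {idf_processing:.3f}")  # smaller: the word is common
print(f"idf('enjoy')      = {idf_enjoy:.3f}")       # larger: the word is rare
```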

#### 2.2.3 Word Embeddings

```python
from gensim.models import Word2Vec

# Train a toy Word2Vec model
sentences = [
    ["natural", "language", "processing", "is", "fun"],
    ["machine", "learning", "is", "interesting"],
    ["deep", "learning", "and", "nlp", "are", "related"]
]

model = Word2Vec(sentences, vector_size=50, window=3, min_count=1, workers=4)

# Look up a word vector
word_vector = model.wv['natural']
print(f"Dimension of the vector for 'natural': {word_vector.shape}")
print(f"Similarity between 'natural' and 'language': {model.wv.similarity('natural', 'language')}")
```
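On a corpus this small the vectors are essentially noise, so treat the numbers as an API demonstration rather than meaningful semantics; in practice you would train on a much larger corpus or load pretrained vectors. Querying nearest neighbours works the same way either way:

```python
# Nearest neighbours of a word in the embedding space (toy corpus, toy results)
print(model.wv.most_similar('learning', topn=3))
```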

## ✨ 3. Building a Text Classifier

Let's put the pieces together and build a complete text classifier, using topic categories from the 20 Newsgroups dataset as the running example.

### 3.1 Preparing the Data

```python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups

# Load the dataset
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
newsgroups = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

# Build a DataFrame
df = pd.DataFrame({
    'text': newsgroups.data,
    'target': newsgroups.target,
    'target_name': [newsgroups.target_names[i] for i in newsgroups.target]
})

print(f"Dataset size: {len(df)}")
print(f"Class distribution:\n{df['target_name'].value_counts()}")

# Preprocess the raw text
df['processed_text'] = df['text'].apply(preprocess_text)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'],
    df['target'],
    test_size=0.2,
    random_state=42,
    stratify=df['target']
)
```

### 3.2 Feature Engineering

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Build TF-IDF features
tfidf = TfidfVectorizer(
    max_features=5000,
    min_df=5,
    max_df=0.7,
    ngram_range=(1, 2)  # unigrams and bigrams
)

# Fit on the training data, then transform both splits
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"Training feature matrix: {X_train_tfidf.shape}")
print(f"Test feature matrix: {X_test_tfidf.shape}")

# Optional: dimensionality reduction with truncated SVD
svd = TruncatedSVD(n_components=100, random_state=42)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

print(f"Training matrix after SVD: {X_train_svd.shape}")
```

### 3.3 Model Training

```python
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name):
    """Train a model and report its accuracy and classification report."""
    print(f"\n{'='*50}")
    print(f"Training {model_name}...")

    # Fit the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    print("\nClassification report:")
    print(classification_report(y_test, y_pred, target_names=newsgroups.target_names))

    return model, accuracy

# Try several classical models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Naive Bayes": MultinomialNB(),
    "Linear SVM": LinearSVC(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

results = {}
for name, model in models.items():
    trained_model, accuracy = train_and_evaluate(
        model, X_train_tfidf, X_test_tfidf, y_train, y_test, name
    )
    results[name] = {
        'model': trained_model,
        'accuracy': accuracy
    }
```
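A small follow-up that is not in the original listing: since every run is stored in `results`, it takes two lines to print a leaderboard once the loop finishes.

```python
# Compare the models collected above, best accuracy first
for name, info in sorted(results.items(), key=lambda kv: kv[1]['accuracy'], reverse=True):
    print(f"{name}: {info['accuracy']:.4f}")
```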

### 3.4 Model Optimization

```python
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Build a pipeline so the vectorizer is tuned together with the classifier
# (solver='liblinear' so that both the l1 and l2 penalties below are valid)
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(solver='liblinear', max_iter=1000, random_state=42))
])

# Parameter grid
parameters = {
    'tfidf__max_features': [3000, 5000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__C': [0.1, 1, 10],
    'clf__penalty': ['l1', 'l2']
}

# Grid search with 3-fold cross-validation
grid_search = GridSearchCV(
    pipeline,
    parameters,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy'
)

print("Starting grid search...")
grid_search.fit(X_train, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Evaluate the best model on the held-out test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
best_accuracy = accuracy_score(y_test, y_pred_best)
print(f"Test accuracy: {best_accuracy:.4f}")
```

### 3.5 Model Deployment and Prediction

```python
import joblib

# Save the best model (the full pipeline, including the vectorizer)
model_filename = 'text_classifier.pkl'
joblib.dump(best_model, model_filename)
print(f"Model saved to {model_filename}")

# Load the model back for prediction
loaded_model = joblib.load(model_filename)

def predict_text(text):
    """Predict the category of a new piece of text."""
    # Apply the same preprocessing used at training time
    processed_text = preprocess_text(text)

    # Predict the class and the per-class probabilities
    prediction = loaded_model.predict([processed_text])
    probability = loaded_model.predict_proba([processed_text])

    # Map the numeric label back to its name
    predicted_class = newsgroups.target_names[prediction[0]]

    # Probabilities for every class
    class_probabilities = {
        newsgroups.target_names[i]: prob
        for i, prob in enumerate(probability[0])
    }

    return predicted_class, class_probabilities

# Try the prediction function on a few new texts
test_texts = [
    "Computer graphics and image processing techniques",
    "Medical research and healthcare innovations",
    "Religious discussions and beliefs"
]

print("\nPrediction examples:")
for text in test_texts:
    predicted_class, probabilities = predict_text(text)
    print(f"\nText: {text[:50]}...")
    print(f"Predicted class: {predicted_class}")
    print(f"Class probabilities: {probabilities}")
```

## 💡 4. Advanced Techniques and Best Practices

### 4.1 Handling Imbalanced Data

```python
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

# Oversample minority classes with SMOTE inside an imblearn pipeline,
# so resampling is only applied to the training data
smote_pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),
    SMOTE(random_state=42),
    LogisticRegression(max_iter=1000, random_state=42)
)

smote_pipeline.fit(X_train, y_train)
y_pred_smote = smote_pipeline.predict(X_test)
print(f"Accuracy with SMOTE: {accuracy_score(y_test, y_pred_smote):.4f}")
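Not part of the original snippet, but a lighter-weight alternative worth knowing: many scikit-learn classifiers can reweight classes directly instead of synthesizing new samples, via `class_weight='balanced'`.

```python
# Alternative to oversampling: weight classes inversely to their frequency
weighted_clf = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
weighted_clf.fit(X_train_tfidf, y_train)
print(f"Accuracy with class_weight='balanced': {accuracy_score(y_test, weighted_clf.predict(X_test_tfidf)):.4f}")
```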

### 4.2 Using a Deep Learning Model

```python
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# Turn the preprocessed texts into integer sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad/truncate every sequence to the same length
max_length = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Build a bidirectional LSTM classifier
model = Sequential([
    Embedding(input_dim=5000, output_dim=128),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(categories), activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train
history = model.fit(
    X_train_pad, y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    verbose=1
)

# Evaluate on the test set
test_loss, test_accuracy = model.evaluate(X_test_pad, y_test, verbose=0)
print(f"Deep learning model test accuracy: {test_accuracy:.4f}")
```
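As a minimal sketch of inference (the sample sentence below is made up for illustration), new text goes through the same `preprocess_text`, tokenizer, and padding steps before being fed to the network:

```python
# Classify one new document with the trained Keras model
new_text = "New imaging techniques for diagnosing heart disease"  # hypothetical example
seq = tokenizer.texts_to_sequences([preprocess_text(new_text)])
padded = pad_sequences(seq, maxlen=max_length, padding='post')
probs = model.predict(padded, verbose=0)[0]
print(newsgroups.target_names[probs.argmax()], probs.max())
```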

## 5. Summary and Recommendations

After working through this article, you should have:

1. NLP fundamentals: an understanding of the core NLP tasks and the overall processing pipeline
2. Text preprocessing skills: how to clean and prepare raw text data
3. Text representation methods: bag of words, TF-IDF, and word embeddings
4. Model-building skills: how to construct and optimize a text classifier
5. Practical experience: a complete, working text classification system

Practical advice:

1. Data quality matters most: invest time in cleaning and preprocessing
2. Feature engineering is key: experiment with different text representations
3. Choose models sensibly: match the model to the data size and task complexity
4. Keep optimizing: use cross-validation and grid search to tune hyperparameters
5. Mind the business requirements: balance accuracy, speed, and interpretability

Natural language processing is a fast-moving field, and new models and techniques appear constantly. Keep up with recent work such as the Transformer architecture and pretrained models like BERT and GPT; these are the technologies currently pushing the field forward.

Remember: practice is the best teacher. Apply what you have learned to real projects and real-world problems; that is how NLP techniques truly sink in.
