# How to Preprocess Text Data for NLP?
Text data preprocessing is a critical step in NLP projects and has a direct impact on model performance. High-quality preprocessing improves accuracy, speeds up training, and reduces noise.

## Why Text Preprocessing Matters

**Why preprocessing is needed**

- Raw text contains a large amount of noise
- Data from different sources comes in inconsistent formats
- Models require standardized input
- It improves model performance and training efficiency

**Preprocessing goals**

- Clean noisy data
- Standardize the text format
- Extract useful features
- Reduce data dimensionality

## Basic Text Cleaning

### 1. Removing Special Characters

**HTML tags**

```python
from bs4 import BeautifulSoup

def remove_html(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()
```

**URLs and email addresses**

```python
import re

def remove_urls_emails(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    return text
```

**Special symbols**

```python
def remove_special_chars(text):
    # Keep only word characters and whitespace
    text = re.sub(r'[^\w\s]', '', text)
    return text
```

### 2. Handling Whitespace

**Removing extra spaces**

```python
def remove_extra_spaces(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text
```

**Normalizing newlines**

```python
def normalize_newlines(text):
    text = text.replace('\r\n', '\n')
    text = text.replace('\r', '\n')
    return text
```

### 3. Handling Numbers

**Number normalization**

```python
def normalize_numbers(text):
    text = re.sub(r'\d+', '<NUM>', text)
    return text
```

**Preserving specific kinds of numbers**

```python
def preserve_specific_numbers(text):
    # Mark years with a dedicated token instead of a generic <NUM>
    text = re.sub(r'\b(19|20)\d{2}\b', '<YEAR>', text)
    # Mark phone numbers with a dedicated token
    text = re.sub(r'\d{3}-\d{3}-\d{4}', '<PHONE>', text)
    return text
```

## Text Normalization

### 1. Case Conversion

**Lowercase**

```python
def to_lowercase(text):
    return text.lower()
```

**Capitalize the first letter**

```python
def capitalize_first(text):
    return text.capitalize()
```

**Title case**

```python
def to_title_case(text):
    return text.title()
```

### 2. Spelling Correction

**Using TextBlob**

```python
from textblob import TextBlob

def correct_spelling(text):
    blob = TextBlob(text)
    return str(blob.correct())
```

**Using pyspellchecker**

```python
from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spelling(text):
    words = text.split()
    # correction() may return None for unknown words; fall back to the original word
    corrected = [spell.correction(word) or word for word in words]
    return ' '.join(corrected)
```

### 3. Expanding Contractions

**Common contractions**

```python
contractions = {
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'d": " would",
    "'ll": " will",
    "'ve": " have",
    "'m": " am"
}

def expand_contractions(text):
    # Naive rule-based expansion; leading spaces keep words separated
    # (e.g. "don't" -> "do not"). Note it also expands possessive 's.
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    return text
```

## Tokenization

### 1. Chinese Word Segmentation

**Using jieba**

```python
import jieba

def chinese_tokenization(text):
    return list(jieba.cut(text))
```

**Using HanLP**

```python
from pyhanlp import HanLP

def chinese_tokenization(text):
    return HanLP.segment(text)
```

**Using LTP**

```python
from ltp import LTP

ltp = LTP()

def chinese_tokenization(text):
    # Note: the segmentation API differs between LTP versions; check the docs for your release
    return ltp.cut(text)
```

### 2. English Tokenization

**Using NLTK**

```python
from nltk.tokenize import word_tokenize

def english_tokenization(text):
    return word_tokenize(text)
```

**Using spaCy**

```python
import spacy

nlp = spacy.load('en_core_web_sm')

def english_tokenization(text):
    doc = nlp(text)
    return [token.text for token in doc]
```

**Using the BERT tokenizer**

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def bert_tokenization(text):
    return tokenizer.tokenize(text)
```

### 3. Subword Tokenization

**WordPiece**

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def wordpiece_tokenization(text):
    return tokenizer.tokenize(text)
```

**BPE (Byte Pair Encoding)**

```python
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def bpe_tokenization(text):
    return tokenizer.tokenize(text)
```

**SentencePiece**

```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load('model.model')

def sentencepiece_tokenization(text):
    return sp.encode_as_pieces(text)
```
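The SentencePiece example above assumes an already trained model file (`model.model`). As a minimal sketch of how such a model could be produced, assuming a plain-text corpus file named `corpus.txt` (hypothetical path) and an illustrative vocabulary size:

```python
import sentencepiece as spm

# Train a SentencePiece model on a plain-text corpus (one sentence per line).
# 'corpus.txt' and the vocabulary size are illustrative; adjust them to your data.
spm.SentencePieceTrainer.train(
    input='corpus.txt',     # hypothetical corpus file
    model_prefix='model',   # writes model.model and model.vocab
    vocab_size=8000,        # illustrative vocabulary size
    model_type='unigram'    # 'bpe' is also supported
)

# Load the trained model and segment a sentence into subword pieces.
sp = spm.SentencePieceProcessor()
sp.load('model.model')
print(sp.encode_as_pieces('This is a test sentence.'))
```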
## Stopword Handling

### 1. Removing Stopwords

**English stopwords**

```python
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered)
```

**Chinese stopwords**

```python
import jieba

def load_stopwords(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return set(line.strip() for line in f)

stop_words = load_stopwords('chinese_stopwords.txt')

def remove_chinese_stopwords(text):
    words = jieba.cut(text)
    filtered = [word for word in words if word not in stop_words]
    return ' '.join(filtered)
```

### 2. Custom Stopwords

```python
def create_custom_stopwords():
    custom_stopwords = {
        '的', '了', '在', '是', '我', '有', '和', '就',
        '不', '人', '都', '一', '一个', '上', '也', '很',
        '到', '说', '要', '去', '你', '会', '着', '没有',
        '看', '好', '自己', '这'
    }
    return custom_stopwords
```

## Lemmatization and Stemming

### 1. Lemmatization

**Using NLTK**

```python
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    words = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized)
```

**Using spaCy**

```python
import spacy

nlp = spacy.load('en_core_web_sm')

def lemmatize(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])
```

### 2. Stemming

**Porter stemmer**

```python
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

def stem(text):
    words = word_tokenize(text)
    stemmed = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed)
```

**Snowball stemmer**

```python
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')

def stem(text):
    words = word_tokenize(text)
    stemmed = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed)
```

## Text Augmentation

### 1. Synonym Replacement

```python
import random

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

def synonym_replacement(text, n=1):
    words = word_tokenize(text)
    if not words:
        return text
    new_words = words.copy()
    for _ in range(n):
        # Pick a random position; this also handles repeated words safely
        idx = random.randrange(len(words))
        word_to_replace = words[idx]
        synonyms = []
        for syn in wordnet.synsets(word_to_replace):
            for lemma in syn.lemmas():
                if lemma.name() != word_to_replace:
                    synonyms.append(lemma.name())
        if synonyms:
            # WordNet uses underscores in multi-word lemmas
            new_words[idx] = random.choice(synonyms).replace('_', ' ')
    return ' '.join(new_words)
```

### 2. Random Deletion

```python
def random_deletion(text, p=0.1):
    words = word_tokenize(text)
    if len(words) == 1:
        return text
    new_words = []
    for word in words:
        # Keep each word with probability 1 - p
        if random.random() > p:
            new_words.append(word)
    if len(new_words) == 0:
        # If everything was deleted, keep one random word
        return random.choice(words)
    return ' '.join(new_words)
```

### 3. Random Swap

```python
def random_swap(text, n=1):
    words = word_tokenize(text)
    if len(words) < 2:
        return text
    for _ in range(n):
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)
```

### 4. Back-Translation

```python
from googletrans import Translator

translator = Translator()

def back_translate(text, intermediate_lang='fr'):
    # Translate into an intermediate language
    translated = translator.translate(text, dest=intermediate_lang).text
    # Translate back into the original language
    back_translated = translator.translate(translated, dest='en').text
    return back_translated
```

## Feature Extraction

### 1. TF-IDF

```python
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(texts)
```

### 2. N-grams

```python
from sklearn.feature_extraction.text import CountVectorizer

# Unigrams
unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
# Bigrams
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
# Trigrams
trigram_vectorizer = CountVectorizer(ngram_range=(3, 3))
```
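The vectorizers above are only constructed, not applied. A minimal usage sketch, assuming `texts` is a list of preprocessed strings as elsewhere in this article (the two sample sentences are purely illustrative):

```python
from sklearn.feature_extraction.text import CountVectorizer

texts = [
    "natural language processing is fun",
    "text preprocessing improves model performance",
]

# Combine unigrams and bigrams in a single vocabulary.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(texts)             # sparse document-term matrix

print(X.shape)                                  # (n_documents, n_features)
print(vectorizer.get_feature_names_out()[:10])  # first few n-gram features (scikit-learn >= 1.0)
```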
### 3. Word Vectors

**Word2Vec**

```python
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

sentences = [word_tokenize(text) for text in texts]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)
```

**GloVe**

```python
from gensim.models import KeyedVectors

# GloVe files have no word2vec header; no_header=True (gensim >= 4.0) handles that
model = KeyedVectors.load_word2vec_format('glove.6B.100d.txt', binary=False, no_header=True)
```

**BERT embeddings**

```python
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool the token embeddings of the last layer
    return outputs.last_hidden_state.mean(dim=1)
```

## Dataset Handling

### 1. Data Splitting

```python
from sklearn.model_selection import train_test_split

# 80% train+val / 20% test, then 75% / 25% of the remainder -> 60/20/20 overall
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42)
```

### 2. Stratified Sampling

```python
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(texts, labels):
    X_train = [texts[i] for i in train_index]
    X_test = [texts[i] for i in test_index]
    y_train = [labels[i] for i in train_index]
    y_test = [labels[i] for i in test_index]
```

### 3. Class Balancing

```python
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Oversampling (X must be 2-D, e.g. a feature matrix such as TF-IDF vectors)
oversampler = RandomOverSampler()
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

# Undersampling
undersampler = RandomUnderSampler()
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)
```

## Complete Preprocessing Pipeline

```python
import re

import jieba
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


class TextPreprocessor:
    def __init__(self, language='english'):
        self.language = language
        self.setup_tools()

    def setup_tools(self):
        if self.language == 'english':
            self.stop_words = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()
        elif self.language == 'chinese':
            self.stop_words = self.load_chinese_stopwords()

    def load_chinese_stopwords(self, file_path='chinese_stopwords.txt'):
        # Same loader as in the stopword section above
        with open(file_path, 'r', encoding='utf-8') as f:
            return set(line.strip() for line in f)

    def preprocess(self, text):
        text = self.clean_text(text)            # cleaning
        text = self.normalize_text(text)        # normalization
        tokens = self.tokenize(text)            # tokenization
        tokens = self.remove_stopwords(tokens)  # stopword removal
        tokens = self.lemmatize(tokens)         # lemmatization
        return ' '.join(tokens)

    def clean_text(self, text):
        # Strip HTML tags
        text = BeautifulSoup(text, 'html.parser').get_text()
        # Remove URLs
        text = re.sub(r'http\S+', '', text)
        # Remove special characters
        text = re.sub(r'[^\w\s]', '', text)
        # Collapse extra whitespace
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def normalize_text(self, text):
        # Lowercase
        text = text.lower()
        # Expand contractions (uses the expand_contractions helper defined earlier)
        text = expand_contractions(text)
        return text

    def tokenize(self, text):
        if self.language == 'english':
            return word_tokenize(text)
        elif self.language == 'chinese':
            return list(jieba.cut(text))

    def remove_stopwords(self, tokens):
        return [token for token in tokens if token.lower() not in self.stop_words]

    def lemmatize(self, tokens):
        if self.language == 'english':
            return [self.lemmatizer.lemmatize(token) for token in tokens]
        return tokens
```

## Best Practices

### 1. Preprocessing Order

1. Clean the text (remove noise)
2. Normalize (case, contractions)
3. Tokenize
4. Remove stopwords
5. Lemmatize / stem
6. Extract features

### 2. Avoid Over-Processing

- Keep information the task may need (casing, punctuation, numbers)
- Consider the requirements of the task
- Do not over-clean

### 3. Consistency

- Keep the processing pipeline consistent across datasets
- Record every step
- Ensure reproducibility

### 4. Performance Optimization

- Process in batches
- Parallelize (see the sketch at the end of this article)
- Cache results

## Tools and Libraries

**Python libraries**

- NLTK: classic NLP toolkit
- spaCy: industrial-strength NLP
- jieba: Chinese word segmentation
- HanLP: Chinese NLP
- TextBlob: simple and easy to use
- gensim: word embeddings

**Pretrained models**

- BERT: contextual embeddings
- GPT: generative models
- T5: text-to-text

## Summary

Text preprocessing is the foundation of any NLP project and directly affects model performance. Choosing the right preprocessing methods requires considering the task, the characteristics of the data, and the model type. A systematic preprocessing pipeline can significantly improve both model performance and training efficiency.
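To illustrate the batch-processing and parallelization tips from the best-practices section, here is a minimal sketch using Python's standard `multiprocessing` module. It assumes the `TextPreprocessor` class from the complete pipeline above is available in the same module; the worker count and chunk size are illustrative, not tuned values.

```python
from multiprocessing import Pool

# Assumes TextPreprocessor from the complete pipeline above is defined in this module.
preprocessor = TextPreprocessor(language='english')

def preprocess_one(text):
    return preprocessor.preprocess(text)

def preprocess_corpus(texts, workers=4, chunksize=100):
    # Split the corpus into chunks and clean them in parallel worker processes.
    with Pool(processes=workers) as pool:
        return pool.map(preprocess_one, texts, chunksize=chunksize)

if __name__ == '__main__':
    sample = ["<p>An example document.</p>", "Another one, with punctuation!"]
    print(preprocess_corpus(sample, workers=2, chunksize=1))
```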