文本预处理(text preprocess)总结

在任何机器学习任务中,清理(cleaning)或预处理(preprocessing)数据与模型构建同样重要,甚至更重要。 当涉及文本等非结构化数据时,这个过程就更加重要。

1. 小写化(Lower Casing)

小写化是一种常见的文本预处理技术。 其思路是将输入文本转换为统一的大小写格式,以便以相同的方式处理 'text'、'Text' 和 'TEXT'。

    def lower_casing(self, text):
        """Return ``text`` with all cased characters converted to lowercase."""
        lowered = text.lower()
        return lowered

2. 删除标点符号(Removal of Punctuations)

另一种常见的文本预处理技术是从文本数据中删除标点符号。 这同样是一个文本标准化过程,有助于以同样的方式处理“hurray”和“hurray!”。

    #     PUNCT_TO_REMOVE = """!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~`‘"""
    def remove_punctuation(self, text):
        """Return ``text`` without any character listed in ``self.PUNCT_TO_REMOVE``."""
        # A translation table whose third argument lists the characters to delete.
        deletion_table = str.maketrans('', '', self.PUNCT_TO_REMOVE)
        return text.translate(deletion_table)

3. 删除停用词(Removal of stopwords)

停用词是语言中常见的单词,如“the”、“a”等。 大多数时候它们可以从文本中删除,因为它们不为下游分析提供有价值的信息。 在像词性标记这样的情况下,我们不应该删除它们,因为它们提供了有关 POS 的非常有价值的信息。

    def remove_stopwords(self, text):
        """Drop every whitespace-separated token that is in ``self.STOPWORDS``.

        ``text`` is converted with ``str()`` first; the surviving tokens are
        rejoined with single spaces.
        """
        kept = []
        for token in str(text).split():
            if token in self.STOPWORDS:
                continue
            kept.append(token)
        return " ".join(kept)

4. 删除常用词(Removal of Frequent words)

在前面的预处理步骤中,我们根据语言信息删除了停用词。 但是,如果我们有一个特定领域的语料库,我们可能还会有一些对我们来说不太重要的频繁出现的单词。

所以这一步就是去除给定语料库中的频繁出现的单词。 如果我们使用 tfidf 之类的东西,就会自动解决这个问题。

from collections import Counter
# Tally how often each whitespace-separated token occurs over the whole column.
cnt = Counter()
for text in df["text"].values:
    cnt.update(text.split())

5. 删除不经常用的词(Removal of Rare words)


n_rare_words = 10
# most_common() orders tokens by descending count, so the reversed tail slice
# holds the n_rare_words entries at the low-frequency end.
RAREWORDS = {w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]}


def remove_rarewords(text):
    """Return ``text`` without the tokens contained in ``RAREWORDS``."""
    kept = (token for token in str(text).split() if token not in RAREWORDS)
    return " ".join(kept)


df["text"] = df["text"].apply(remove_rarewords)

6. 词干提取(Stemming)


例如,如果语料库中有两个单词 walks 和 walking,那么词干提取就会去掉它们的后缀,使它们都成为 walk。 但在另一个例子中,我们有两个单词 console 和 consoling,词干分析器将删除后缀并使它们成为 consol,这不是一个正确的英语单词。

有多种类型的词干算法可用,其中最著名的一种是广泛使用的 porter 词干分析器。 我们可以使用 nltk 包来实现同样的目的。

    #  self.stemmer = PorterStemmer()
    def stem_words(self, text):
        """Stem each whitespace-separated token with ``self.stemmer`` and rejoin with spaces."""
        stem = self.stemmer.stem
        stemmed = [stem(token) for token in text.split()]
        return " ".join(stemmed)

7. 词形还原(Lemmatization)


词形还原与词干提取类似,都是把词的各种变形还原为其基本形式,但它会确保还原后的结果(lemma)是该语言中真实存在的单词。 因此,这一过程通常比词干提取过程慢。 根据速度要求,我们可以选择使用词干提取或词形还原。

让我们使用 nltk 中的 WordNetLemmatizer 来对句子进行词形还原

    #  self.lemmatizer = WordNetLemmatizer()
    def lemmatize_words(self, text):
        """Lemmatize each whitespace-separated token with ``self.lemmatizer`` and rejoin with spaces."""
        lemmatize = self.lemmatizer.lemmatize
        lemmas = [lemmatize(token) for token in text.split()]
        return " ".join(lemmas)

8. 删除表情符号(Removal of Emojis)

随着社交媒体平台的使用越来越多,表情符号在我们日常生活中的使用也呈爆炸式增长。 也许我们可能需要删除这些表情符号以进行一些文本分析。


# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(string):
    """Return ``string`` with characters from the listed emoji ranges removed.

    Only the four code-point ranges spelled out below are matched; runs of
    consecutive matches are deleted in one substitution.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    matcher = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return matcher.sub(r'', string)

remove_emoji("game is on 🔥🔥")

9. 删除颜文字(Removal of Emoticons)


def remove_emoticons(text):
    """Return ``text`` with every match of an ``EMOTICONS`` entry removed.

    The entries of ``EMOTICONS`` are joined verbatim with ``|`` into a single
    regex alternation.
    """
    # NOTE(review): entries are used as regex source without escaping, so they
    # are presumably already escaped where EMOTICONS is defined — confirm.
    alternation = u'|'.join(EMOTICONS)
    emoticon_pattern = re.compile(u'(' + alternation + u')')
    return emoticon_pattern.sub(r'', text)

remove_emoticons("Hello :-)")

10. 替换或删除Http url

     def remove_urls(self, text):
        """Delete every ``http://``/``https://`` or ``www.`` run (up to the next whitespace) from ``text``."""
        return re.sub(r'https?://\S+|www\.\S+', r'', text)
     def replace_http_url(self, text,  word = "urladd"):
        """Replace every ``http://``/``https://`` or ``www.`` run in ``text`` with ``word``."""
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(word, text)

11. 替换邮件地址

    def replace_email_id(self, text,  word = "emailadd"):
        """Replace every e-mail-address-shaped substring of ``text`` with ``word``."""
        email_pattern = re.compile(r"([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})")
        return email_pattern.sub(word, text)

12.  替换数字


    def replace_digit(self, text,  word = "digitadd"):
        """Replace every run of consecutive digits in ``text`` with ``word``.

        Args:
            text: Input string.
            word: Replacement inserted once per digit run.

        Returns:
            The string with each digit run substituted.
        """
        # Raw string: '\d' in a normal literal is an invalid escape sequence,
        # which newer Python versions warn about.
        return re.sub(r'\d+', word, text)

13. 删掉多余空格和换行

    def remove_extra_space(self, text):
        """Collapse each run of space characters in ``text`` into a single space."""
        space_run = re.compile(' +')
        return space_run.sub(' ', text)

    def remove_line_break_space(self, text):
        """Join the whitespace-separated tokens of ``text`` with single spaces.

        Splitting on any whitespace removes line breaks, tabs and repeated or
        leading/trailing spaces in one pass.
        """
        tokens = text.split()
        return " ".join(tokens)

14. 提取html标签里内容并删除

    def remove_html(self, text):
        """Parse ``text`` with BeautifulSoup's ``html5lib`` parser and return its ``.text``."""
        soup = BeautifulSoup(text, features='html5lib')
        return soup.text

15. 缩写还原

# import library
import contractions
# contracted text
text = '''I'll be there within 5 min. Shouldn't you be there too? 
          I'd love to see u there my dear. It's awesome to meet new friends.
          We've been waiting for this day for so long.'''
# Expand each whitespace-separated token with contractions.fix; the original
# loop had no body, so nothing was ever appended to the list.
expanded_words = [contractions.fix(word) for word in text.split()]
expanded_text = ' '.join(expanded_words)
print('Original text: ' + text)
print('Expanded_text: ' + expanded_text)


from bs4 import BeautifulSoup
import lxml
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
from spellchecker import SpellChecker
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import contractions

class text_preprocessing():
    """String-level cleaning helpers for English text.

    Every public method takes a string and returns a new string. The NLTK
    stop-word list, stemmer, lemmatizer and the spell checker are created
    once in ``__init__`` and reused by the methods.
    """

    # Characters deleted by remove_punctuation.
    PUNCT_TO_REMOVE = """!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~`‘"""

    def __init__(self):
        """Create the stop-word set, stemmer, lemmatizer, spell checker and POS map."""
        self.STOPWORDS = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

        self.lemmatizer = WordNetLemmatizer()
        self.spell = SpellChecker()
        # Maps the first letter of a POS tag to the matching WordNet constant.
        self.wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

    def expand_contractions(self, text):
        """Return ``text`` after passing it through ``contractions.fix``."""
        expanded_text = contractions.fix(text)
        return expanded_text

    def lemmatize_words(self, text):
        """Lemmatize each whitespace-separated token (default POS) and rejoin with spaces."""
        return " ".join([self.lemmatizer.lemmatize(word) for word in text.split()])

    def lemmatize_words_position(self, text):
        """Lemmatize each token using the part of speech assigned by ``nltk.pos_tag``.

        The first letter of each tag is looked up in ``self.wordnet_map``;
        tags without an entry fall back to ``wordnet.NOUN``.
        """
        pos_tagged_text = nltk.pos_tag(text.split())
        return " ".join(
            [self.lemmatizer.lemmatize(word, self.wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

    def remove_punctuation(self, text):
        """Return ``text`` without any character listed in ``PUNCT_TO_REMOVE``."""
        return text.translate(str.maketrans('', '', self.PUNCT_TO_REMOVE))

    def remove_space(self, text):
        """Replace each literal ``_x000D_`` marker in ``text`` with a space."""
        # NOTE(review): "_x000D_" is presumably the escaped carriage return
        # left behind by spreadsheet exports — confirm against the data source.
        return text.replace("_x000D_", " ")

    def remove_extra_space(self, text):
        """Collapse each run of space characters into a single space."""
        return re.sub(' +', ' ', text)

    def remove_line_break_space(self, text):
        """Join the whitespace-separated tokens of ``text`` with single spaces."""
        return " ".join(text.split())

    def remove_html(self, text):
        """Parse ``text`` with BeautifulSoup's ``html5lib`` parser and return its ``.text``."""
        return BeautifulSoup(text, features='html5lib').text

    def lower_casing(self, text):
        """Return ``text`` converted to lowercase."""
        return text.lower()

    def remove_urls(self, text):
        """Delete every ``http(s)://`` or ``www.`` run (up to the next whitespace)."""
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(r'', text)

    def remove_stopwords(self, text):
        """Drop every whitespace-separated token that is in ``self.STOPWORDS``."""
        return " ".join([word for word in str(text).split() if word not in self.STOPWORDS])

    def stem_words(self, text):
        """Stem each whitespace-separated token with ``self.stemmer`` and rejoin with spaces."""
        return " ".join([self.stemmer.stem(word) for word in text.split()])

    def remove_words(self, text, words):
        """Drop every whitespace-separated token of ``text`` that is contained in ``words``."""
        return " ".join([word for word in str(text).split() if word not in words])

    def correct_spellings(self, text):
        """Replace tokens the spell checker reports as unknown with its correction.

        Args:
            text: Input string; it is split on whitespace.

        Returns:
            The tokens rejoined with single spaces, unknown ones replaced by
            ``self.spell.correction(token)``. An empty input yields ``""``.
        """
        tokens = text.split()
        misspelled_words = self.spell.unknown(tokens)
        corrected_text = []
        for word in tokens:
            if word in misspelled_words:
                # Keep the original token if the checker has no suggestion
                # (correction() can return None in recent pyspellchecker releases).
                corrected_text.append(self.spell.correction(word) or word)
            else:
                corrected_text.append(word)
        return " ".join(corrected_text)

    def replace_http_url(self, text,  word = "urladd"):
        """Replace every ``http(s)://`` or ``www.`` run in ``text`` with ``word``."""
        return re.sub(r'https?://\S+|www\.\S+', word, text)

    def replace_email_id(self, text,  word = "emailadd"):
        """Replace every e-mail-address-shaped substring of ``text`` with ``word``."""
        return re.sub(r"([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})", word,text)

    def replace_digit(self, text,  word = "digitadd"):
        """Replace every run of consecutive digits in ``text`` with ``word``."""
        # Raw string avoids the invalid '\d' escape of a normal literal.
        return re.sub(r'\d+', word, text)

if __name__ == '__main__':
    # Bind the instance to its own name so the class is not shadowed.
    preprocessor = text_preprocessing()

    # The original literal was never closed, which made the file unparsable.
    text = """
    this text is for test
    """

    # Substitute placeholders first, then expand contractions.
    text = preprocessor.replace_email_id(text)
    text = preprocessor.replace_http_url(text)
    text = preprocessor.replace_digit(text)
    text = preprocessor.expand_contractions(text)

    # Optional follow-up steps, kept disabled as in the original demo:
    # text = preprocessor.remove_extra_space(text)
    # print('after removing extra space:', text)
    # old_text = preprocessor.remove_line_break_space(text)
    # print('old_text:', old_text)
    # text = preprocessor.lemmatize_words(old_text)
    # print("lemmatize_words_position:", text)
    # text = preprocessor.stem_words(old_text)
    # print("stem_words:", text)

Pandas 处理文本的代码

import pandas as pd
from pandas import DataFrame
from tabulate import tabulate
from nlp.text_preprocessing_util.text_preprocessing import *

# Directory that holds the input workbook and receives the output workbooks.
base_dir = "C:/apps/ml_datasets"

# NOTE: this rebinds the imported class name to an instance of it; the helper
# functions in this script call their methods on this module-level instance.
text_preprocessing = text_preprocessing()

def get_dataset():
    """Load the e-mail workbook and build the raw text/label frame.

    Reads sheet ``Emails`` of ``Support_email_category.xlsx`` under
    ``base_dir``, joins ``Subject`` and ``Email content`` with a single space
    into ``X_orig``, copies ``Priority`` into ``y_orig``, writes the frame to
    ``raw_data.xlsx`` under ``base_dir`` and returns it.

    Returns:
        DataFrame with the columns ``X_orig`` and ``y_orig``.
    """
    data = pd.read_excel(base_dir + '/Support_email_category.xlsx', sheet_name='Emails')

    # A missing 'Email content' cell would turn the whole concatenation into
    # NaN (a float), which breaks the string methods applied to X_orig later;
    # treat missing content as empty text and cast like 'Subject'.
    content = data['Email content'].fillna('').astype(str)
    X_orig = data['Subject'].astype(str) + " " + content
    y_orig = data['Priority']

    new_data = pd.DataFrame()
    new_data['X_orig'] = X_orig
    new_data['y_orig'] = y_orig
    new_data.to_excel(base_dir + '/raw_data.xlsx', index=False)

    return new_data

def lower_casing(data: DataFrame):
    """Lower-case every entry of the ``X_orig`` column in place and return ``data``."""
    column = data["X_orig"]
    data["X_orig"] = column.apply(lambda entry: text_preprocessing.lower_casing(entry))
    return data

def remove_space(data: DataFrame):
    """Apply ``text_preprocessing.remove_space`` to every ``X_orig`` entry and return ``data``."""
    column = data["X_orig"]
    data["X_orig"] = column.apply(lambda entry: text_preprocessing.remove_space(entry))
    return data

def remove_punctuation(data: DataFrame):
    """Apply ``text_preprocessing.remove_punctuation`` to every ``X_orig`` entry and return ``data``."""
    column = data["X_orig"]
    data["X_orig"] = column.apply(lambda entry: text_preprocessing.remove_punctuation(entry))
    return data

def remove_stopwords(data: DataFrame):
    """Apply ``text_preprocessing.remove_stopwords`` to every ``X_orig`` entry and return ``data``."""
    column = data["X_orig"]
    data["X_orig"] = column.apply(lambda entry: text_preprocessing.remove_stopwords(entry))
    return data

def correct_spellings(data: DataFrame):
    """Apply ``text_preprocessing.correct_spellings`` to every ``X_orig`` entry and return ``data``."""
    column = data["X_orig"]
    data["X_orig"] = column.apply(lambda entry: text_preprocessing.correct_spellings(entry))
    return data

def remove_freqwords(data: DataFrame):
    """Remove the ten most frequent tokens of the ``X_orig`` column from every entry.

    Token counts are taken over the whole column (whitespace split); the
    column is rewritten in place and ``data`` is returned.
    """
    from collections import Counter

    cnt = Counter()
    for document in data["X_orig"].values:
        cnt.update(document.split())

    FREQWORDS = {token for token, _ in cnt.most_common(10)}

    def drop_frequent(entry):
        return text_preprocessing.remove_words(entry, FREQWORDS)

    data["X_orig"] = data["X_orig"].apply(drop_frequent)
    return data

def remove_rare_words(data: DataFrame):
    """Remove the ten entries at the low-frequency end of the token ranking.

    Token counts are taken over the whole ``X_orig`` column (whitespace
    split). The selected tokens are printed, the column is rewritten in place
    and ``data`` is returned.
    """
    from collections import Counter

    cnt = Counter()
    for document in data["X_orig"].values:
        cnt.update(document.split())

    n_rare_words = 10
    ranked = cnt.most_common()
    # Reversed tail slice: the last n_rare_words entries of the ranking.
    tail = ranked[:-n_rare_words - 1:-1]
    RAREWORDS = set(token for token, _ in tail)
    print("rarewords:", RAREWORDS)

    def drop_rare(entry):
        return text_preprocessing.remove_words(entry, RAREWORDS)

    data["X_orig"] = data["X_orig"].apply(drop_rare)
    return data

def stem_words(data: DataFrame):
    """Apply ``text_preprocessing.stem_words`` to every ``X_orig`` entry and return ``data``."""
    column = data["X_orig"]
    data["X_orig"] = column.apply(lambda entry: text_preprocessing.stem_words(entry))
    return data

def lemmatize_words(data: DataFrame):
    """Apply ``text_preprocessing.lemmatize_words`` to every ``X_orig`` entry and return ``data``."""
    column = data["X_orig"]
    data["X_orig"] = column.apply(lambda entry: text_preprocessing.lemmatize_words(entry))
    return data

def lemmatize_words1(data: DataFrame):
    """Lemmatize every ``X_orig`` entry with part-of-speech information.

    Args:
        data: Frame with a string column ``X_orig``.

    Returns:
        The same frame with ``X_orig`` rewritten in place.
    """
    # lemmatize_words_position accepts only the text: the lemmatizer and the
    # POS-to-WordNet map are attributes of the preprocessing instance, so
    # passing them as extra arguments raised a TypeError.
    data["X_orig"] = data["X_orig"].apply(lambda text: text_preprocessing.lemmatize_words_position(text))
    return data

def remove_urls(data: DataFrame):
    """Apply ``text_preprocessing.remove_urls`` to every ``X_orig`` entry and return ``data``."""
    column = data["X_orig"]
    data["X_orig"] = column.apply(lambda entry: text_preprocessing.remove_urls(entry))
    return data

def remove_html(data: DataFrame):
    """Apply ``text_preprocessing.remove_html`` to every ``X_orig`` entry and return ``data``."""
    column = data["X_orig"]
    data["X_orig"] = column.apply(lambda entry: text_preprocessing.remove_html(entry))
    return data

def process_abbreviated_words(data: DataFrame):
    """Store ``chat_words_conversion`` of each ``X_orig`` entry in column ``after X_orig``.

    ``X_orig`` itself is not modified; ``data`` is returned.
    """
    # NOTE(review): chat_words_conversion has to be provided by the imported
    # preprocessing class — confirm that the module defines it.
    def convert(entry):
        return text_preprocessing.chat_words_conversion(entry)

    converted = data["X_orig"].apply(convert)
    data["after X_orig"] = converted
    return data

def texts_preprocessing(data: DataFrame):
    """Run the cleaning steps on ``data`` in a fixed order and return the result."""
    data = remove_space(data)
    data = remove_urls(data)
    data = remove_html(data)
    data = process_abbreviated_words(data)
    data = lower_casing(data)
    # print(tabulate(data.head(3)))

    # Token-level filtering runs on the lower-cased text.
    for step in (remove_punctuation, remove_stopwords, remove_freqwords, remove_rare_words):
        data = step(data)

    # stem_words and lemmatize_words are disabled in favour of the
    # POS-aware variant; correct_spellings is disabled as well.
    data = lemmatize_words1(data)
    return data

def save_file(data, file_name):
    """Write ``data`` to the Excel file ``file_name`` inside ``base_dir``."""
    target = base_dir + '/' + file_name
    data.to_excel(target, index=None)

if __name__ == '__main__':
    # Load the raw frame, run the cleaning pipeline, then persist the result.
    data = texts_preprocessing(get_dataset())
    save_file(data, 'after_preprocessing.xlsx')


