python版 html正文提取(CEPF)

from selectolax.parser import *
import math



class CountInfo:
    """Text and tag statistics gathered for one DOM node and its subtree."""

    # Attribute names in the exact order ``__str__`` reports them.
    _FIELDS = (
        "textCount",
        "linkTextCount",
        "tagCount",
        "linkTagCount",
        "density",
        "densitySum",
        "score",
        "pCount",
        "leafList",
    )

    def __init__(self):
        # Character totals: all text, and the part that sits inside links.
        self.textCount = self.linkTextCount = 0
        # Tag totals: all tags, and the ``<a>`` tags among them.
        self.tagCount = self.linkTagCount = 0
        # Own text density, and the sum of the direct children's densities.
        self.density = self.densitySum = 0
        self.score = 0
        # Number of ``<p>`` tags in the subtree.
        self.pCount = 0
        # Lengths of the individual text nodes found in the subtree.
        self.leafList = []

    def __str__(self) -> str:
        """Return every statistic as comma-separated ``name: value`` pairs."""
        return ", ".join(
            f"{name}: {getattr(self, name)}" for name in self._FIELDS
        )


class ContentExtractor:
    """Pick the DOM element most likely to hold a page's main content.

    Every node under ``<body>`` is summarised in a ``CountInfo`` (text
    length, link text length, tag counts, text density, ...).  Each element
    is then scored from those statistics and the best-scoring one is
    returned by ``getContentElement``.

    Typical use::

        extractor = ContentExtractor()
        extractor.reload(html)
        node = extractor.getContentElement()
    """

    # Pseudo-tags of nodes that are neither elements nor text.
    # NOTE(review): selectolax appears to report comment nodes as "-comment"
    # or "_comment" depending on the backend -- confirm against the
    # installed version.
    _NON_ELEMENT_TAGS = frozenset({"-comment", "_comment"})

    def __init__(self):
        # Both attributes are (re)populated by reload(); defining them here
        # keeps clear()/getContentElement() from raising AttributeError when
        # they are called before a document has been loaded.
        self.doc = None
        self.infoMap = []

    def reload(self, content):
        """Parse *content* as HTML and drop statistics of any earlier document."""
        self.doc = HTMLParser(content)
        self.infoMap = []

    def clear(self):
        """Remove script/noscript/style/iframe/br tags from the loaded document.

        Does nothing when no document has been loaded yet.
        """
        if self.doc is None:
            return
        tags = ["script", "noscript", "style", "iframe", "br"]
        self.doc.strip_tags(tags)

    def computeInfo(self, node) -> "CountInfo":
        """Recursively gather statistics for *node* and everything below it.

        * A text node yields a ``CountInfo`` carrying only its text length
          (also recorded as the single entry of ``leafList``).
        * A comment node yields an empty ``CountInfo``.
        * Any other node is treated as an element: the statistics of its
          children are summed, the element itself is counted, its density
          is computed, and ``{"node": ..., "countInfo": ...}`` is appended
          to ``self.infoMap``.

        Returns:
            The ``CountInfo`` describing *node*.
        """
        tagname = node.tag

        if tagname == "-text":
            countInfo = CountInfo()
            # Guard against a missing text payload instead of crashing in len().
            text_len = len(node.text_content or "")
            countInfo.textCount = text_len
            countInfo.leafList.append(text_len)
            return countInfo

        if tagname in self._NON_ELEMENT_TAGS:
            # Comments are not tags and carry no visible text, so they must
            # neither inflate tagCount nor become content candidates.
            return CountInfo()

        countInfo = CountInfo()
        for child_node in node.iter(include_text=True):
            childCountInfo = self.computeInfo(child_node)
            countInfo.textCount += childCountInfo.textCount
            countInfo.linkTextCount += childCountInfo.linkTextCount
            countInfo.tagCount += childCountInfo.tagCount
            countInfo.linkTagCount += childCountInfo.linkTagCount
            countInfo.leafList.extend(childCountInfo.leafList)
            countInfo.densitySum += childCountInfo.density
            countInfo.pCount += childCountInfo.pCount

        # Count the element itself.
        countInfo.tagCount += 1
        if tagname == "a":
            # Everything inside a link is link text.
            countInfo.linkTextCount = countInfo.textCount
            countInfo.linkTagCount += 1
        elif tagname == "p":
            countInfo.pCount += 1

        # Density = non-link characters per non-link tag.
        pureLen = countInfo.textCount - countInfo.linkTextCount
        tag_len = countInfo.tagCount - countInfo.linkTagCount
        if pureLen == 0 or tag_len == 0:
            countInfo.density = 0
        else:
            countInfo.density = pureLen / tag_len

        self.infoMap.append({"node": node, "countInfo": countInfo})
        return countInfo

    def computerVar(self, data) -> float:
        """Return the population variance of *data*.

        Special cases: an empty sequence gives ``0``; a single value ``x``
        gives ``x / 2`` rather than the mathematical variance of ``0``.
        """
        if not data:
            return 0
        if len(data) == 1:
            return data[0] / 2
        avg = sum(data) / len(data)
        return sum((x - avg) ** 2 for x in data) / len(data)

    def computeScore(self, countInfo) -> float:
        """Compute the content score for one element's statistics.

        The score is the product of four factors:
        ``ln(sqrt(var(leafList) + 1))``, ``densitySum``,
        ``ln(non-link text length + 1)`` and ``log10(pCount + 2)``.
        """
        sqrt = math.sqrt(self.computerVar(countInfo.leafList) + 1)
        score = (
            math.log(sqrt)
            * countInfo.densitySum
            * math.log(countInfo.textCount - countInfo.linkTextCount + 1)
            * math.log10(countInfo.pCount + 2)
        )
        return score

    def getContentElement(self):
        """Return the element with the highest content score.

        The document is cleaned with ``clear()`` first.  ``<a>`` elements
        and ``<body>`` itself are never chosen.

        Returns:
            The best-scoring node; ``None`` when no element scores above
            zero; the empty string ``""`` when no document is loaded or the
            document has no body.
        """
        if self.doc is None:
            return ""
        self.clear()
        if not self.doc.body:
            return ""

        # Start from a clean slate so that calling this method again on the
        # same document does not pile up duplicate entries.
        self.infoMap = []
        self.computeInfo(self.doc.body)

        content = None
        maxScore = 0
        for obj in self.infoMap:
            node = obj["node"]
            if node.tag in ("a", "body"):
                continue
            score = self.computeScore(obj["countInfo"])
            if score > maxScore:
                maxScore = score
                content = node

        return content

相关推荐

  1. python html正文提取(CEPF)

    2024-06-13 11:24:09       6 阅读
  2. python导入导出excel、python提取html正文

    2024-06-13 11:24:09       19 阅读
  3. python如何提取html中所有中文

    2024-06-13 11:24:09       8 阅读
  4. Python提取xml节点

    2024-06-13 11:24:09       29 阅读
  5. 图像特征提取 python

    2024-06-13 11:24:09       8 阅读

最近更新

  1. TCP协议是安全的吗?

    2024-06-13 11:24:09       14 阅读
  2. 阿里云服务器执行yum,一直下载docker-ce-stable失败

    2024-06-13 11:24:09       16 阅读
  3. 【Python教程】压缩PDF文件大小

    2024-06-13 11:24:09       15 阅读
  4. 通过文章id递归查询所有评论(xml)

    2024-06-13 11:24:09       18 阅读

热门阅读

  1. alzet微量渗透泵说明书(上篇)

    2024-06-13 11:24:09       6 阅读
  2. 网络限速导致的服务器访问https异常的处理过程

    2024-06-13 11:24:09       7 阅读
  3. QT:一个通用工程文件配置,可以存储为common.pri

    2024-06-13 11:24:09       7 阅读
  4. 【Qt快速入门(四)】- QLabel文本框的使用

    2024-06-13 11:24:09       7 阅读
  5. 【RSA加密解密】

    2024-06-13 11:24:09       4 阅读
  6. 6.5 c语言

    2024-06-13 11:24:09       4 阅读
  7. css3新增的伪类有哪些

    2024-06-13 11:24:09       5 阅读
  8. SQL调优方案

    2024-06-13 11:24:09       3 阅读
  9. python中的数据分析(jupyter)

    2024-06-13 11:24:09       6 阅读