删除 SVG/矢量图形文件中的重复绘图对象

Question

事实证明，在 Python 中处理 XML 非常简单。以下代码以递归方式（以便处理嵌套的组）遍历文件。代码会为每个路径存储一个哈希值（因为某些路径非常长）；如果某个路径的哈希值已经出现过，就把该节点标记为待删除。实际的删除操作在返回父节点之前才执行，因为如果在遍历过程中立即删除，似乎会破坏对子节点的迭代。代码同时还会删除白色填充的对象。

import xml.etree.ElementTree as ET
import hashlib as hash

def iter_groups(group):
    """Recursively drop duplicate and white-filled path elements below *group*.

    Direct children of *group* whose tag is ``rtag + 'g'`` are processed
    recursively. Every child whose tag is ``rtag + 'path'`` is fingerprinted
    with an MD5 digest of its attributes:

    * if the digest is already in the module-level ``hashlist`` the path is
      removed as a duplicate and the module-level ``count`` is incremented;
    * otherwise, if its ``fill`` attribute contains ``rgb(255,255,255)`` the
      path is removed as white (``count`` is not incremented for these);
    * otherwise the digest is recorded in ``hashlist``.

    Relies on the module-level names ``rtag``, ``hashlist`` and ``count``
    being set by the calling script. ``hash`` is the file's alias for
    ``hashlib``. Returns ``None``; the tree is modified in place.
    """
    global count
    group_tag = rtag + 'g'
    path_tag = rtag + 'path'
    doomed = []
    for child in group:
        if child.tag == group_tag:
            iter_groups(child)
            continue
        if child.tag != path_tag:
            continue
        # Hash the sorted attribute items so that attribute order does not
        # affect the digest; encode because hashlib needs bytes on Python 3.
        fingerprint = str(sorted(child.attrib.items())).encode('utf-8')
        h = hash.md5(fingerprint).hexdigest()
        print(h)
        if h in hashlist:
            doomed.append(child)
            print("removing  %s in %s %s  -- duplicate"
                  % (child.tag, group.tag, group.attrib))
            count += 1
            continue
        fill = child.attrib.get('fill')
        if fill is None:
            print('no fill')
        else:
            print(fill)
            if "rgb(255,255,255)" in fill:
                doomed.append(child)
                print("removing  %s in %s %s  -- white"
                      % (child.tag, group.tag, group.attrib))
                continue
        # Record every kept path, including those without a fill attribute;
        # otherwise later copies of a fill-less path are never recognised.
        hashlist.append(h)
    # Removal is deferred until the loop is done so that the iteration over
    # the children of *group* is not disturbed.
    for r in doomed:
        group.remove(r)

#main#
# Module-level state that iter_groups reads and updates through `global`.
count=0
hashlist=[]

tree = ET.parse('imgtest.svg')
root = tree.getroot()
# Keep everything up to and including the first '}' of the root tag; this is
# the prefix iter_groups prepends to 'g' and 'path' when comparing tags.
rtag = root.tag.partition('}')[0] + '}'
iter_groups(root)
# NOTE: no namespace prefix is registered in this snippet, so the written
# tags come out prefixed with "ns0:"; calling
# ET.register_namespace("", "http://www.w3.org/2000/svg") before parsing
# would avoid that.
tree.write(
    'imgtest_out.svg',
    method="xml",
    xml_declaration=True,
    encoding="us-ascii",
    default_namespace="",
)

问题：

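输出中的所有标签都会带上“ns0:”前缀——这是因为代码没有为 SVG 命名空间注册前缀，ElementTree 在写出时会自动生成一个；可以在解析之前调用 ET.register_namespace("", "http://www.w3.org/2000/svg") 来避免，或者事后用查找替换（find & replace）去掉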
你可能会剩下很多空的组和未被引用的 ID——之后再用 scour 工具（加上 --enable-id-stripping 选项）处理并覆盖该文件是一个好主意。

结果：初始文件：20,030KB 代码执行后：8,555KB 清理后：4,545KB 这在 inkscape 中几乎是可行的。

仍然有少量由略有不同的代码产生的视觉重复，以及一些功能上为空的组。

编辑：上面的代码有几个错误，尤其是它实际上并没有删除白色对象。我还拼凑了一些代码来处理空组和只包含 1 个元素的组。虽然很丑陋，但无论如何还是贴在这里。

import xml.etree.cElementTree as ET
import hashlib as hash
import copy

def get_attr(obj,attr):
    """Return the value of attribute *attr* of *obj*, or ``None`` if absent.

    *obj* must expose a dict-like ``attrib`` mapping (as ElementTree
    elements do). The former ``else`` clause was unreachable because the
    ``try`` body always returned, so the lookup is now a plain ``dict.get``.
    """
    return obj.attrib.get(attr)

def iter_groups(group):
    """Recursively drop duplicate paths, white paths and empty groups.

    Direct children of *group* whose tag is ``rtag + 'g'`` are processed
    recursively. Every child whose tag is ``rtag + 'path'`` is fingerprinted
    with an MD5 digest of its attributes:

    * if the digest is already in the module-level ``hashlist`` the path is
      removed as a duplicate and the module-level ``count`` is incremented;
    * otherwise, if its ``fill`` attribute contains ``rgb(255,255,255)`` the
      path is removed as white (``count`` is not incremented for these);
    * otherwise the digest is recorded in ``hashlist``.

    After the paths have been removed, child groups that have no children
    left are removed from *group* as well.

    Relies on the module-level names ``rtag``, ``hashlist`` and ``count``
    being set by the calling script. ``hash`` is the file's alias for
    ``hashlib``. Returns ``None``; the tree is modified in place.
    """
    global count
    group_tag = rtag + 'g'
    path_tag = rtag + 'path'
    doomed = []
    for child in group:
        if child.tag == group_tag:
            iter_groups(child)
            continue
        if child.tag != path_tag:
            continue
        # Hash the sorted attribute items so that attribute order does not
        # affect the digest; encode because hashlib needs bytes on Python 3.
        fingerprint = str(sorted(child.attrib.items())).encode('utf-8')
        h = hash.md5(fingerprint).hexdigest()
        print(h)
        if h in hashlist:
            doomed.append(child)
            print("removing  %s in %s %s  -- duplicate"
                  % (child.tag, group.tag, group.attrib))
            count += 1
            continue
        fill = child.attrib.get('fill')
        if fill is not None and "rgb(255,255,255)" in fill:
            print("removing  %s in %s %s  -- white"
                  % (child.attrib.get('id'), group.tag, group.attrib))
            doomed.append(child)
            continue
        # Record every kept path, including those without a fill attribute;
        # otherwise later copies of a fill-less path are never recognised.
        hashlist.append(h)
    # Removal is deferred until the loop is done so that the iteration over
    # the children of *group* is not disturbed.
    for r in doomed:
        print("about to remove %s" % (r.attrib,))
        group.remove(r)
    # Second sweep: drop child groups that ended up with no children.
    empty_groups = [child for child in group
                    if child.tag == group_tag and len(child) == 0]
    for child in empty_groups:
        print("removing  %s in %s %s  -- empty"
              % (child.tag, group.tag, group.attrib))
        group.remove(child)


def ungroup_singles(group):
    """Remove empty child groups of *group* and unwrap single-child groups.

    For each direct child of *group* whose tag is ``rtag + 'g'``:

    * more than one child: the group is processed recursively;
    * exactly one child: that child replaces the group at the same position
      (so the drawing order is kept), unless the child is itself an empty
      group, in which case the wrapping group is removed;
    * no children: the group is removed.

    The module-level ``count`` is incremented once per removal or
    promotion. Relies on the module-level names ``rtag`` and ``count``.
    Returns ``None``; the tree is modified in place.

    NOTE(review): attributes of an unwrapped group (for example a transform
    or clip path) are discarded together with the group -- confirm this is
    acceptable for the input files.
    """
    global count
    group_tag = rtag + 'g'
    # Index-based walk: *group* is modified while it is traversed, which a
    # plain ``for child in group`` cannot do without skipping siblings.
    idx = 0
    while idx < len(group):
        child = group[idx]
        if child.tag != group_tag:
            idx += 1
            continue
        print("len(group %s ) %s" % (child.get('id'), len(child)))
        if len(child) > 1:
            ungroup_singles(child)
            # If the recursion shrank the group to 0 or 1 children, look at
            # it again instead of moving on.
            if len(child) > 1:
                idx += 1
            continue
        if len(child) == 1:
            only = child[0]
            if len(only) >= 1 or only.tag != group_tag:
                print("about to promote %s %s %s"
                      % (only.tag, only.get('id'), only.get('class')))
                print(len(only))
                child.remove(only)
                # Keep the wrapper's trailing whitespace for the promoted
                # element so the file layout is not disturbed.
                only.tail = child.tail
                # Replace in place; the promoted element is examined on the
                # next iteration because idx is not advanced.
                group[idx] = only
            else:
                # The only child is an empty group, so the wrapper holds
                # nothing drawable: drop the wrapper.
                print("about to remove %s %s %s"
                      % (only.tag, only.get('id'), only.get('class')))
                del group[idx]
            count += 1
            continue
        # len(child) == 0: an empty group.
        print("about to remove %s %s %s"
              % (child.tag, child.get('id'), child.get('class')))
        del group[idx]
        count += 1

#main#
# Module-level state that iter_groups / ungroup_singles read and update
# through `global`.
hashlist=[]
count=0
# Register the SVG namespace as the default namespace for serialisation.
ET.register_namespace("","http://www.w3.org/2000/svg")
tree = ET.parse('imgtest_l.svg')
root = tree.getroot()
# Keep everything up to and including the first '}' of the root tag; the
# functions prepend it to 'g' and 'path' when comparing tags.
rtag= root.tag.split('}')[0]+'}'
iter_groups(root)
# Single-argument print(...) produces the same text on Python 2 and 3.
print("A %s  elements removed" % count)
# Repeat the ungrouping pass until a pass changes nothing, with a hard cap
# of 10 passes.
lcount=1
while True:
    count=0
    ungroup_singles(root)
    print("%s : %s  empty groups removed / single elements promoted from groups" % (lcount, count))
    lcount+=1
    if count==0 or lcount>10:
        break

tree.write('imgtest_out.svg',encoding="us-ascii", xml_declaration=True, default_namespace="", method="xml")

Answer 1

事实证明，在 Python 中处理 XML 非常简单。以下代码以递归方式（以便处理嵌套的组）遍历文件。代码会为每个路径存储一个哈希值（因为某些路径非常长）；如果某个路径的哈希值已经出现过，就把该节点标记为待删除。实际的删除操作在返回父节点之前才执行，因为如果在遍历过程中立即删除，似乎会破坏对子节点的迭代。代码同时还会删除白色填充的对象。

import xml.etree.ElementTree as ET
import hashlib as hash

def iter_groups(group):
    """Recursively drop duplicate and white-filled path elements below *group*.

    Direct children of *group* whose tag is ``rtag + 'g'`` are processed
    recursively. Every child whose tag is ``rtag + 'path'`` is fingerprinted
    with an MD5 digest of its attributes:

    * if the digest is already in the module-level ``hashlist`` the path is
      removed as a duplicate and the module-level ``count`` is incremented;
    * otherwise, if its ``fill`` attribute contains ``rgb(255,255,255)`` the
      path is removed as white (``count`` is not incremented for these);
    * otherwise the digest is recorded in ``hashlist``.

    Relies on the module-level names ``rtag``, ``hashlist`` and ``count``
    being set by the calling script. ``hash`` is the file's alias for
    ``hashlib``. Returns ``None``; the tree is modified in place.
    """
    global count
    group_tag = rtag + 'g'
    path_tag = rtag + 'path'
    doomed = []
    for child in group:
        if child.tag == group_tag:
            iter_groups(child)
            continue
        if child.tag != path_tag:
            continue
        # Hash the sorted attribute items so that attribute order does not
        # affect the digest; encode because hashlib needs bytes on Python 3.
        fingerprint = str(sorted(child.attrib.items())).encode('utf-8')
        h = hash.md5(fingerprint).hexdigest()
        print(h)
        if h in hashlist:
            doomed.append(child)
            print("removing  %s in %s %s  -- duplicate"
                  % (child.tag, group.tag, group.attrib))
            count += 1
            continue
        fill = child.attrib.get('fill')
        if fill is None:
            print('no fill')
        else:
            print(fill)
            if "rgb(255,255,255)" in fill:
                doomed.append(child)
                print("removing  %s in %s %s  -- white"
                      % (child.tag, group.tag, group.attrib))
                continue
        # Record every kept path, including those without a fill attribute;
        # otherwise later copies of a fill-less path are never recognised.
        hashlist.append(h)
    # Removal is deferred until the loop is done so that the iteration over
    # the children of *group* is not disturbed.
    for r in doomed:
        group.remove(r)

#main#
# Module-level state that iter_groups reads and updates through `global`.
count=0
hashlist=[]

tree = ET.parse('imgtest.svg')
root = tree.getroot()
# Keep everything up to and including the first '}' of the root tag; this is
# the prefix iter_groups prepends to 'g' and 'path' when comparing tags.
rtag = root.tag.partition('}')[0] + '}'
iter_groups(root)
# NOTE: no namespace prefix is registered in this snippet, so the written
# tags come out prefixed with "ns0:"; calling
# ET.register_namespace("", "http://www.w3.org/2000/svg") before parsing
# would avoid that.
tree.write(
    'imgtest_out.svg',
    method="xml",
    xml_declaration=True,
    encoding="us-ascii",
    default_namespace="",
)

问题：

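输出中的所有标签都会带上“ns0:”前缀——这是因为代码没有为 SVG 命名空间注册前缀，ElementTree 在写出时会自动生成一个；可以在解析之前调用 ET.register_namespace("", "http://www.w3.org/2000/svg") 来避免，或者事后用查找替换（find & replace）去掉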
你可能会剩下很多空的组和未被引用的 ID——之后再用 scour 工具（加上 --enable-id-stripping 选项）处理并覆盖该文件是一个好主意。

结果：初始文件：20,030KB 代码执行后：8,555KB 清理后：4,545KB 这在 inkscape 中几乎是可行的。

仍然有少量由略有不同的代码产生的视觉重复，以及一些功能上为空的组。

编辑：上面的代码有几个错误，尤其是它实际上并没有删除白色对象。我还拼凑了一些代码来处理空组和只包含 1 个元素的组。虽然很丑陋，但无论如何还是贴在这里。

import xml.etree.cElementTree as ET
import hashlib as hash
import copy

def get_attr(obj,attr):
    """Return the value of attribute *attr* of *obj*, or ``None`` if absent.

    *obj* must expose a dict-like ``attrib`` mapping (as ElementTree
    elements do). The former ``else`` clause was unreachable because the
    ``try`` body always returned, so the lookup is now a plain ``dict.get``.
    """
    return obj.attrib.get(attr)

def iter_groups(group):
    """Recursively drop duplicate paths, white paths and empty groups.

    Direct children of *group* whose tag is ``rtag + 'g'`` are processed
    recursively. Every child whose tag is ``rtag + 'path'`` is fingerprinted
    with an MD5 digest of its attributes:

    * if the digest is already in the module-level ``hashlist`` the path is
      removed as a duplicate and the module-level ``count`` is incremented;
    * otherwise, if its ``fill`` attribute contains ``rgb(255,255,255)`` the
      path is removed as white (``count`` is not incremented for these);
    * otherwise the digest is recorded in ``hashlist``.

    After the paths have been removed, child groups that have no children
    left are removed from *group* as well.

    Relies on the module-level names ``rtag``, ``hashlist`` and ``count``
    being set by the calling script. ``hash`` is the file's alias for
    ``hashlib``. Returns ``None``; the tree is modified in place.
    """
    global count
    group_tag = rtag + 'g'
    path_tag = rtag + 'path'
    doomed = []
    for child in group:
        if child.tag == group_tag:
            iter_groups(child)
            continue
        if child.tag != path_tag:
            continue
        # Hash the sorted attribute items so that attribute order does not
        # affect the digest; encode because hashlib needs bytes on Python 3.
        fingerprint = str(sorted(child.attrib.items())).encode('utf-8')
        h = hash.md5(fingerprint).hexdigest()
        print(h)
        if h in hashlist:
            doomed.append(child)
            print("removing  %s in %s %s  -- duplicate"
                  % (child.tag, group.tag, group.attrib))
            count += 1
            continue
        fill = child.attrib.get('fill')
        if fill is not None and "rgb(255,255,255)" in fill:
            print("removing  %s in %s %s  -- white"
                  % (child.attrib.get('id'), group.tag, group.attrib))
            doomed.append(child)
            continue
        # Record every kept path, including those without a fill attribute;
        # otherwise later copies of a fill-less path are never recognised.
        hashlist.append(h)
    # Removal is deferred until the loop is done so that the iteration over
    # the children of *group* is not disturbed.
    for r in doomed:
        print("about to remove %s" % (r.attrib,))
        group.remove(r)
    # Second sweep: drop child groups that ended up with no children.
    empty_groups = [child for child in group
                    if child.tag == group_tag and len(child) == 0]
    for child in empty_groups:
        print("removing  %s in %s %s  -- empty"
              % (child.tag, group.tag, group.attrib))
        group.remove(child)


def ungroup_singles(group):
    """Remove empty child groups of *group* and unwrap single-child groups.

    For each direct child of *group* whose tag is ``rtag + 'g'``:

    * more than one child: the group is processed recursively;
    * exactly one child: that child replaces the group at the same position
      (so the drawing order is kept), unless the child is itself an empty
      group, in which case the wrapping group is removed;
    * no children: the group is removed.

    The module-level ``count`` is incremented once per removal or
    promotion. Relies on the module-level names ``rtag`` and ``count``.
    Returns ``None``; the tree is modified in place.

    NOTE(review): attributes of an unwrapped group (for example a transform
    or clip path) are discarded together with the group -- confirm this is
    acceptable for the input files.
    """
    global count
    group_tag = rtag + 'g'
    # Index-based walk: *group* is modified while it is traversed, which a
    # plain ``for child in group`` cannot do without skipping siblings.
    idx = 0
    while idx < len(group):
        child = group[idx]
        if child.tag != group_tag:
            idx += 1
            continue
        print("len(group %s ) %s" % (child.get('id'), len(child)))
        if len(child) > 1:
            ungroup_singles(child)
            # If the recursion shrank the group to 0 or 1 children, look at
            # it again instead of moving on.
            if len(child) > 1:
                idx += 1
            continue
        if len(child) == 1:
            only = child[0]
            if len(only) >= 1 or only.tag != group_tag:
                print("about to promote %s %s %s"
                      % (only.tag, only.get('id'), only.get('class')))
                print(len(only))
                child.remove(only)
                # Keep the wrapper's trailing whitespace for the promoted
                # element so the file layout is not disturbed.
                only.tail = child.tail
                # Replace in place; the promoted element is examined on the
                # next iteration because idx is not advanced.
                group[idx] = only
            else:
                # The only child is an empty group, so the wrapper holds
                # nothing drawable: drop the wrapper.
                print("about to remove %s %s %s"
                      % (only.tag, only.get('id'), only.get('class')))
                del group[idx]
            count += 1
            continue
        # len(child) == 0: an empty group.
        print("about to remove %s %s %s"
              % (child.tag, child.get('id'), child.get('class')))
        del group[idx]
        count += 1

#main#
# Module-level state that iter_groups / ungroup_singles read and update
# through `global`.
hashlist=[]
count=0
# Register the SVG namespace as the default namespace for serialisation.
ET.register_namespace("","http://www.w3.org/2000/svg")
tree = ET.parse('imgtest_l.svg')
root = tree.getroot()
# Keep everything up to and including the first '}' of the root tag; the
# functions prepend it to 'g' and 'path' when comparing tags.
rtag= root.tag.split('}')[0]+'}'
iter_groups(root)
# Single-argument print(...) produces the same text on Python 2 and 3.
print("A %s  elements removed" % count)
# Repeat the ungrouping pass until a pass changes nothing, with a hard cap
# of 10 passes.
lcount=1
while True:
    count=0
    ungroup_singles(root)
    print("%s : %s  empty groups removed / single elements promoted from groups" % (lcount, count))
    lcount+=1
    if count==0 or lcount>10:
        break

tree.write('imgtest_out.svg',encoding="us-ascii", xml_declaration=True, default_namespace="", method="xml")

删除 SVG/矢量图形文件中的重复绘图对象

答案1

相关内容