我在文件中有以下示例文本,我想从中删除重复项。最终目标是从此文件中删除所有重复实例(其中之一是 web:webapi)。
请注意,这是一个 600+ MB 的文件。
"nirmal" -> ["app:am","app:am","app:identity_gateway","app:identity_gateway","app:loginsvc","app:loginsvc","app:loginui","app:loginui","app:ticket","app:ticket","app:webapi","app:webapi","ds:config_store","ds:config_store","ds:cts_store","ds:cts_store","ds:user_store","ds:user_store","web:am","web:am","web:identity_gateway","web:identity_gateway","web:loginsvc","web:loginsvc","web:loginui","web:loginui","web:ticket","web:ticket","web:webapi","web:webapi"];
"mbl" -> ["app:phx","web:phx","app:vas","development:mobile","s2:detsvc","s2core:detsvc","txn:detsvc","web:detsvc","app:fidoproxy","app:landing","app:mobile","app:noknok","app:optchart","app:redis","app:sentinel","app:spring","cws:mesg","cws3:wsproxy","s2:billpay","s2:services","s2core:billpay","s2core:services","web:fidoproxy","web:spring","at:admin","at:eqsroll","at:oqsroll","batch:admin","cws:ctnt","cws:risk","cws:user","cws3:acctaggtr","cws3:content","cws3:risk","cws3:rtao","cws3:rtmm","ets:ord","fhs:eqs","fhs:oqs","s2:aarcomm","s2:acctcomm","s2:espsvc","s2:ibsvc","s2core:aarcomm","s2core:espsvc","s2core:ibsvc","txb:b2bsvc","txn:acct","txn:ibank2","txn:olsvc","txn:rtmm","txn:services","txn:wtools","web:aempros_mpublish","web:b2b","web:etsecxml","web:ibxml","web:olxml","web:prospect","web:tablet","web:ticket","web:wtxml","web:xmlacct","web:xmlrtmm","s2:asset","s2core:asset","app:phxcfgsvr","app:phxdshbrd","app:webapiagg","s2core:mblsvc","s2core:snapquotes","s2:mblsvc","s2:snapquotes","web:landing","web:mobile","web:phxcfgsvr","web:phxdshbrd","web:webapiagg","app:phxcfgsvr","app:phxdshbrd","app:webapiagg","s2core:mblsvc","s2core:snapquotes","s2:mblsvc","s2:snapquotes","web:landing","web:mobile","web:phxcfgsvr","web:phxdshbrd","web:webapiagg","app:phxcfgsvr","app:phxdshbrd","app:webapiagg","s2core:mblsvc","s2core:snapquotes","s2:mblsvc","s2:snapquotes","web:landing","web:mobile","web:phxcfgsvr","web:phxdshbrd","web:webapiagg","app:phxcfgsvr","app:phxdshbrd","app:webapiagg","s2core:mblsvc","s2core:snapquotes","s2:mblsvc","s2:snapquotes","web:landing","web:mobile","web:phxcfgsvr","web:phxdshbrd","web:webapiagg","app:phxcfgsvr","app:phxdshbrd","app:webapiagg","s2core:mblsvc","s2core:snapquotes","s2:mblsvc","s2:snapquotes","web:landing","web:mobile","web:phxcfgsvr","web:phxdshbrd","web:webapiagg","app:phxcfgsvr","app:phxdshbrd","app:webapiagg","s2core:mblsvc","s2core:snapquotes","s2:mblsvc","s2:snapquotes","web:landing","web:mobile","web:phxcfgsvr","web:phxdshbrd","web
:webapiagg","app:phxcfgsvr","app:phxdshbrd","app:webapiagg","s2core:mblsvc","s2core:snapquotes","s2:mblsvc","s2:snapquotes","web:landing","web:mobile","web:phxcfgsvr","web:phxdshbrd","web:webapiagg","app:phxcfgsvr","app:phxdshbrd","app:webapiagg","s2core:mblsvc","s2core:snapquotes","s2:mblsvc","s2:snapquotes","web:landing","web:mobile","web:phxcfgsvr","web:phxdshbrd","web:webapiagg"];
我们如何在 Linux 中做到这一点?
整个文件的每一行都是相同格式的文本。我想针对每一行,取出以“->”分隔的第一个字符串,然后在它对应的值中查找以逗号分隔的重复项。如果发现任何重复项,就应该把它们删除。
答案1
使用 sed:
sed -e :1 -e 's/\("[^",]*"\)\(.*\),\1/\1\2/;t1'
- `:1` 是标记循环起点的跳转标记。
- `"[^",]*"` 是一个字段。从模式中排除逗号可以避免将 `","` 视为字段。
- 通过把该字段放入 `\(\)`,我们可以用 `\1` 反向引用相同的字段。
- 该 `s` 命令删除再次出现的同一字段以及它前面的逗号。
- 如果进行了替换,则 `t` 命令跳转到开头的跳转标记,重复这一过程,直到该行不再有重复字段。
答案2
这是一种方法:
$ perl -lne '/^(.*?->\s*\[)(.*)(\].*)/; $k{$_}++ for split(/,/,$2);
print "$1", join ",", keys(%k), "$3"' file
"nirmal" -> ["web:webapi","web:identity_gateway","ds:config_store","app:ticket","ds:user_store","web:loginsvc","ds:cts _store","web:loginui","web:am","ds:cts_store","app:loginui","app:am","app:identity_gateway","app:webapi","web:ticket","app:loginsvc",];
"mbl" -> ["s2:acctcomm","cws:mesg","txn:olsvc","app:loginsvc","web:b2b","app:loginui","app:optchart","app:phxcfgsvr","cws3:risk","s2core:billpay","s2:detsvc","app:spring","app:phxdshbrd","ds:user_store","web:ticket","batch:admin","at:eqsroll","s2:asset","s2core:mblsvc","txn:acct","app:am","s2:espsvc","development:mobile","web:fidoproxy","app:webapi","txn:rtmm","s2:mblsvc","app:redis","cws:user","cws3:acctaggtr","ds:cts_store","txn:detsvc","web:mobile","app:webapiagg","txb:b2bsvc","fhs:oqs","cws3:wsproxy","web:landing","web:olxml","fhs:eqs","web:prospect","s2core:ibsvc","cws:risk","web:phx","s2:ibsvc","s2core:espsvc","txn:services","web:ibxml","web:tablet","at:admin","web:identity_gateway","web:spring","web:phxdshbrd","web:phxcfgsvr","s2core:snapquotes","app:sentinel","s2core:asset","ets:ord","cws3:rtmm","web:loginui","txn:wtools","web:loginsvc","s2:snapquotes","app:fidoproxy","web:etsecxml","s2:aarcomm","web:am","web:wtxml","app:noknok","ds:config_store","app:ticket","txn:ibank2","s2core:services","s2:billpay","web:detsvc","app:landing","cws3:content","web:aempros_mpublish","s2core:aarcomm","app:mobile","web:webapiagg","s2core:detsvc","web:webapi","cws3:rtao","app:identity_gateway","web:xmlrtmm","web:xmlacct","ds:cts _store","s2:services","at:oqsroll","app:vas","app:phx","cws:ctnt",];
解释
- `perl -lne`:在输入文件的每一行上(`-n`)运行由 `-e` 给出的脚本。`-l` 从输入中删除尾随换行符,并为每次调用 `print` 添加一个换行符。
- `/^(.*?->\s*\[)(.*)(\].*)/`:在每个输入行上匹配三组数据。第一组 `.*?->\s*\[` 是从行首到第一个 `->` 的所有内容,然后是 0 个或多个空白字符,然后是 `[`。因为该模式位于括号中,所以我们可以将其称为 `$1`。接下来,匹配直到最后一个 `]` 之前的所有内容(`(.*)`),这会是 `$2`。最后,匹配该行的其余部分(`\].*`),这将是 `$3`。
- `$k{$_}++ for split(/,/,$2);`:这是删除重复项的技巧。通过以 `,` 拆分,将 `$2`(重复数据)放入数组中,然后将该数组的每个元素作为哈希 `%k` 中的键。由于哈希键始终是唯一的,这将消除 `$2` 中的重复项。
- `print "$1", join ",", keys(%k), "$3"`:现在,我们打印原始行的开头 `$1`,然后是用逗号连接的 `%k` 哈希键,最后是行的其余部分 `$3`。这不会保留原始输入顺序,但会删除重复数据。

注意:`%k` 在处理各行之间没有被清空,因此前面行的键会累积到后面行的输出中(上面 "mbl" 行的输出里就包含了 "nirmal" 行的条目);此外,`$3` 也作为参数传给了 `join`,所以输出中 `]` 之前会多出一个逗号。