删除文本文件中重叠的 IP 范围

Question

如果您不介意输入行被重新排序，我有一个使用 GNU Awk 和 sort 命令的相对简单的解决方案。基本思想是将 IP 地址从点分十进制形式转换为单个数字，这使得比较它们变得非常容易，然后使用 sort 的 -k 标志，该标志允许指定只按特定字段进行排序。

为了紧凑，这里还使用了 GNU awk 的协进程（coprocess）功能，这使得在使用 sort 之前和之后处理数据变得非常容易：

编辑：这个答案的原始版本中的 sort 命令行略有错误：sort -k2,3r 实际上将字段 2 和 3 视为单个键，并按相反的顺序排序。sort -k2,2n -k3,3rn 才会执行所需的操作：首先按字段 2 排序，并以（反向的）字段 3 作为平局时的决胜依据：

# Run as: gawk -F: -f <thisfile.awk> <input file>
BEGIN {
  # Command line of the external sort that the per-line rule writes to and
  # the END rule reads from. Records are ':'-separated; the sort keys are:
  #   -k2,2n   field 2 (the 1st ip as a number), smallest-to-largest
  #   -k3,3nr  field 3 (the 2nd ip as a number), largest-to-smallest
  sort = "sort -n -t:" " -k2,2n" " -k3,3nr"
}

# Built once: a regex for "a.b.c.d-e.f.g.h" with every octet in its own
# capture group. '[.]' matches a literal dot only; a bare '.' would accept
# any character between the octets.
BEGIN {
  octet_re = "([[:digit:]]+)"
  quad_re  = octet_re "[.]" octet_re "[.]" octet_re "[.]" octet_re
  range_re = quad_re "-" quad_re
}

# For every line:
{
  # Store the eight octets of the range in $2 into ips[1..8]. match()
  # clears ips first, so a line without a well-formed range yields 0 for
  # both addresses below.
  match($2, range_re, ips)
  # Fold the octets of each address into a single number (base 256, first
  # octet most significant) so that addresses compare numerically.
  low  = ips[4] + 256 * (ips[3] + 256 * (ips[2] + 256 * ips[1]))
  high = ips[8] + 256 * (ips[7] + 256 * (ips[6] + 256 * ips[5]))
  # Send "label:low:high" to the sort coprocess; ':' is also the delimiter
  # between the 2 IPs so the record stays ':'-separated throughout.
  print $1 ":" low ":" high |& sort
}

# After sending all lines to sort in the appropriate format
END {
  # Make sure the coprocess exists even if no record was ever sent (e.g.
  # empty input). Otherwise the getline below would start sort with its
  # input still open, and awk and sort would wait on each other forever.
  # This writes zero bytes, so it never adds a record.
  printf "" |& sort
  # Close sort's input stream, so that we can read its output
  close(sort, "to")
  # Keep track of the upper end of the last range that was printed
  prevHigh = 0
  # Read & field-split all lines from sort's output
  while ((sort |& getline) > 0) {
    # Force a numeric comparison of the high address
    high = $3 + 0
    # One range is contained in another if its low address is >= the
    # other's (guaranteed by the sort order) and its high address is <=
    # the other's. So a record is printed only when its high address is
    # above every high address printed so far.
    if (high > prevHigh) {
      print $1 ":" num_to_ip($2) "-" num_to_ip($3)
      # This is now the previous range
      prevHigh = high
    }
  }
  # Close the reading side as well so the sort process is reaped
  close(sort)
}

# Convert an address held as a single number back to "a.b.c.d" notation.
function num_to_ip(n) {
  return (int(n / 16777216) % 256) "." (int(n / 65536) % 256) "." \
         (int(n / 256) % 256) "." (n % 256)
}

Answer 1

如果您不介意输入行被重新排序，我有一个使用 GNU Awk 和 sort 命令的相对简单的解决方案。基本思想是将 IP 地址从点分十进制形式转换为单个数字，这使得比较它们变得非常容易，然后使用 sort 的 -k 标志，该标志允许指定只按特定字段进行排序。

为了紧凑，这里还使用了 GNU awk 的协进程（coprocess）功能，这使得在使用 sort 之前和之后处理数据变得非常容易：

编辑：这个答案的原始版本中的 sort 命令行略有错误：sort -k2,3r 实际上将字段 2 和 3 视为单个键，并按相反的顺序排序。sort -k2,2n -k3,3rn 才会执行所需的操作：首先按字段 2 排序，并以（反向的）字段 3 作为平局时的决胜依据：

# Run as: gawk -F: -f <thisfile.awk> <input file>
BEGIN {
  # Command line of the external sort that the per-line rule writes to and
  # the END rule reads from. Records are ':'-separated; the sort keys are:
  #   -k2,2n   field 2 (the 1st ip as a number), smallest-to-largest
  #   -k3,3nr  field 3 (the 2nd ip as a number), largest-to-smallest
  sort = "sort -n -t:" " -k2,2n" " -k3,3nr"
}

# Built once: a regex for "a.b.c.d-e.f.g.h" with every octet in its own
# capture group. '[.]' matches a literal dot only; a bare '.' would accept
# any character between the octets.
BEGIN {
  octet_re = "([[:digit:]]+)"
  quad_re  = octet_re "[.]" octet_re "[.]" octet_re "[.]" octet_re
  range_re = quad_re "-" quad_re
}

# For every line:
{
  # Store the eight octets of the range in $2 into ips[1..8]. match()
  # clears ips first, so a line without a well-formed range yields 0 for
  # both addresses below.
  match($2, range_re, ips)
  # Fold the octets of each address into a single number (base 256, first
  # octet most significant) so that addresses compare numerically.
  low  = ips[4] + 256 * (ips[3] + 256 * (ips[2] + 256 * ips[1]))
  high = ips[8] + 256 * (ips[7] + 256 * (ips[6] + 256 * ips[5]))
  # Send "label:low:high" to the sort coprocess; ':' is also the delimiter
  # between the 2 IPs so the record stays ':'-separated throughout.
  print $1 ":" low ":" high |& sort
}

# After sending all lines to sort in the appropriate format
END {
  # Make sure the coprocess exists even if no record was ever sent (e.g.
  # empty input). Otherwise the getline below would start sort with its
  # input still open, and awk and sort would wait on each other forever.
  # This writes zero bytes, so it never adds a record.
  printf "" |& sort
  # Close sort's input stream, so that we can read its output
  close(sort, "to")
  # Keep track of the upper end of the last range that was printed
  prevHigh = 0
  # Read & field-split all lines from sort's output
  while ((sort |& getline) > 0) {
    # Force a numeric comparison of the high address
    high = $3 + 0
    # One range is contained in another if its low address is >= the
    # other's (guaranteed by the sort order) and its high address is <=
    # the other's. So a record is printed only when its high address is
    # above every high address printed so far.
    if (high > prevHigh) {
      print $1 ":" num_to_ip($2) "-" num_to_ip($3)
      # This is now the previous range
      prevHigh = high
    }
  }
  # Close the reading side as well so the sort process is reaped
  close(sort)
}

# Convert an address held as a single number back to "a.b.c.d" notation.
function num_to_ip(n) {
  return (int(n / 16777216) % 256) "." (int(n / 65536) % 256) "." \
         (int(n / 256) % 256) "." (n % 256)
}

删除文本文件中重叠的 IP 范围

答案1

相关内容