有没有一种方法可以根据一列提取 3 个文件之间共有的行？

Question 1

这应该既快速又高效。它使用 join（GNU coreutils 的一部分）和 field（tangetools 的一部分）：

# Join the three inputs on their third column. join emits the key first, so
# every surviving row is one wide record of 25 columns: the key followed by the
# remaining 8 columns of neu1, then of nep1, then of ret1.
join -1 3 -2 3 neu1 nep1 |
  join -1 1 -2 3 - ret1 |
  # Fan the wide rows out to one writer per output file. Each writer keeps only
  # that file's columns and moves the key (column 1) back into third place.
  # field is https://gitlab.com/ole.tange/tangetools/-/tree/master/field
  tee \
    >(field 10-11,1,12-17 > nep2) \
    >(field 18-19,1,20-25 > ret2) \
    > >(field 2-3,1,4-9 > neu2)

Answer

这应该既快速又高效。它使用 join（GNU coreutils 的一部分）和 field（tangetools 的一部分）：

# Join the three inputs on their third column. join emits the key first, so
# every surviving row is one wide record of 25 columns: the key followed by the
# remaining 8 columns of neu1, then of nep1, then of ret1.
join -1 3 -2 3 neu1 nep1 |
  join -1 1 -2 3 - ret1 |
  # Fan the wide rows out to one writer per output file. Each writer keeps only
  # that file's columns and moves the key (column 1) back into third place.
  # field is https://gitlab.com/ole.tange/tangetools/-/tree/master/field
  tee \
    >(field 10-11,1,12-17 > nep2) \
    >(field 18-19,1,20-25 > ret2) \
    > >(field 2-3,1,4-9 > neu2)

Question 2

$ cat ../tst.awk
# Two-pass filter over three input files, keyed on the third field.
#
# Pass 1 (ARGIND 1..3): queue each file a second time at the end of ARGV and
# count, per key, how many distinct files contain it.
# Pass 2 (ARGIND 4..6): emit the lines whose key was counted in all 3 files,
# tagged with the output name derived from the current input file.

# First line of a file during pass 1: schedule the file for the second pass.
ARGIND < 4 && FNR == 1 {
    ARGV[ARGC++] = FILENAME
}

# Every line during pass 1: count the key once per file, then skip pass-2 rules.
ARGIND < 4 {
    if ( !((ARGIND, $3) in keyInFile) ) {
        keyInFile[ARGIND, $3] = 1
        fileCount[$3]++
    }
    next
}

# First line of a file during pass 2: switch to that file's output name, which
# is the input name with its first "1" replaced by "2".
FNR == 1 {
    close(out)
    out = FILENAME
    sub(/1/, "2", out)
}

# Pass 2: keep only the keys present in all three files.
{
    if ( fileCount[$3] == 3 ) {
        print $0 " > " out
    }
}

。

$ awk -f ../tst.awk neu1 nep1 ret1
Chr        BP          Marker      MAF A1 A2 Direction   pValue    N > neu2
1 100000012 1:100000012:G:T 0.229925  T  G         + 0.650403 1594 > neu2
1 100000827 1:100000827:C:T 0.287014  T  C         + 0.955449 1594 > neu2
1 100002713 1:100002713:C:T 0.097867  T  C         - 0.290455 1594 > neu2
Chr        BP          Marker       MAF A1 A2 Direction    pValue    N > nep2
1 100000012 1:100000012:G:T 0.2300430  T  G         - 0.1420030 1641 > nep2
1 100000827 1:100000827:C:T 0.2867150  T  C         - 0.2045580 1641 > nep2
1 100002713 1:100002713:C:T 0.0975015  T  C         - 0.0555507 1641 > nep2
Chr        BP          Marker       MAF A1 A2 Direction   pValue    N > ret2
1 100000012 1:100000012:G:T 0.2322760  T  G         - 0.230383 1608 > ret2
1 100000827 1:100000827:C:T 0.2882460  T  C         - 0.120356 1608 > ret2
1 100002713 1:100002713:C:T 0.0982587  T  C         - 0.272936 1608 > ret2

测试完成后，将 print $0 " > " out 改为 print > out，即可实际生成所需的输出文件。上面的脚本使用了 GNU awk 特有的 ARGIND；如果您没有 GNU awk，只需在脚本的第一行添加 FNR==1 { ARGIND++ } 即可。

Answer

$ cat ../tst.awk
# Two-pass filter over three input files, keyed on the third field.
#
# Pass 1 (ARGIND 1..3): queue each file a second time at the end of ARGV and
# count, per key, how many distinct files contain it.
# Pass 2 (ARGIND 4..6): emit the lines whose key was counted in all 3 files,
# tagged with the output name derived from the current input file.

# First line of a file during pass 1: schedule the file for the second pass.
ARGIND < 4 && FNR == 1 {
    ARGV[ARGC++] = FILENAME
}

# Every line during pass 1: count the key once per file, then skip pass-2 rules.
ARGIND < 4 {
    if ( !((ARGIND, $3) in keyInFile) ) {
        keyInFile[ARGIND, $3] = 1
        fileCount[$3]++
    }
    next
}

# First line of a file during pass 2: switch to that file's output name, which
# is the input name with its first "1" replaced by "2".
FNR == 1 {
    close(out)
    out = FILENAME
    sub(/1/, "2", out)
}

# Pass 2: keep only the keys present in all three files.
{
    if ( fileCount[$3] == 3 ) {
        print $0 " > " out
    }
}

。

$ awk -f ../tst.awk neu1 nep1 ret1
Chr        BP          Marker      MAF A1 A2 Direction   pValue    N > neu2
1 100000012 1:100000012:G:T 0.229925  T  G         + 0.650403 1594 > neu2
1 100000827 1:100000827:C:T 0.287014  T  C         + 0.955449 1594 > neu2
1 100002713 1:100002713:C:T 0.097867  T  C         - 0.290455 1594 > neu2
Chr        BP          Marker       MAF A1 A2 Direction    pValue    N > nep2
1 100000012 1:100000012:G:T 0.2300430  T  G         - 0.1420030 1641 > nep2
1 100000827 1:100000827:C:T 0.2867150  T  C         - 0.2045580 1641 > nep2
1 100002713 1:100002713:C:T 0.0975015  T  C         - 0.0555507 1641 > nep2
Chr        BP          Marker       MAF A1 A2 Direction   pValue    N > ret2
1 100000012 1:100000012:G:T 0.2322760  T  G         - 0.230383 1608 > ret2
1 100000827 1:100000827:C:T 0.2882460  T  C         - 0.120356 1608 > ret2
1 100002713 1:100002713:C:T 0.0982587  T  C         - 0.272936 1608 > ret2

测试完成后，将 print $0 " > " out 改为 print > out，即可实际生成所需的输出文件。上面的脚本使用了 GNU awk 特有的 ARGIND；如果您没有 GNU awk，只需在脚本的第一行添加 FNR==1 { ARGIND++ } 即可。

Question 3

这确实是一个很好的问题。如果您确定文件按第三列升序排序，我建议使用与“SQL 世界”中的合并连接类似的方法。文件的排序必须使用相同的排序函数来完成，该函数用于比较 bash 中的字符串。下面的脚本需要 bash。应该通过以下方式调用它：

$ script file1 file2 file3

它将创建 3 个输出文件：file1.out、file2.out 和 file3.out。每个输出文件包含对应源文件中的这样一些行：其第三列的值在所有输入文件中都出现过。凡是具有此类值的行，都会被写入相应的“.out”文件。

#!/bin/bash
# Merge-join style filter for three files sorted ascending on their third
# whitespace-separated column.
#
# Usage: script file1 file2 file3
#
# Creates file1.out, file2.out and file3.out. Each output starts with the first
# line of its input (copied unchanged) followed by the input lines whose third
# column value occurs in all three inputs.
#
# Every input is read exactly once, all three in parallel. Ordering decisions
# use the string operators \< and \> of the [ builtin, so the inputs must be
# sorted consistently with that comparison.
#
# Note: a final line without a trailing newline makes read return non-zero and
# is therefore treated as end of file.

# Refuse to run without the three file names; otherwise the redirections below
# would operate on empty names.
if [ $# -lt 3 ]; then
  printf 'Usage: %s file1 file2 file3\n' "$0" >&2
  exit 2
fi

Fil1="$1"
Fil2="$2"
Fil3="$3"
# Opening file descriptors for reading
exec 10<"$Fil1"
exec 20<"$Fil2"
exec 30<"$Fil3"
# Open output files
exec 11>"$Fil1".out
exec 21>"$Fil2".out
exec 31>"$Fil3".out
# First line is copied without changes.
IFS= read -r -u 10 Line1
printf "%s\n" "$Line1" >&11
IFS= read -r -u 20 Line2
printf "%s\n" "$Line2" >&21
IFS= read -r -u 30 Line3
printf "%s\n" "$Line3" >&31
# Prepare to walk through files, searching for identical values in the third column.
# For file n: LineN is the current line, RCn is 0 while a current line exists,
# Fn3 is its third column and Fn receives everything after the third column.
IFS= read -r -u 10 Line1
if [ $? -eq 0 ] ; then RC1=0; else RC1=1; fi
read -r F11 F12 F13 F1 <<<  "$Line1"

IFS= read -r -u 20 Line2
if [ $? -eq 0 ] ; then RC2=0; else RC2=1; fi
read -r F21 F22 F23 F2 <<<  "$Line2"

IFS= read -r -u 30 Line3
if [ $? -eq 0 ] ; then RC3=0; else RC3=1; fi
read -r F31 F32 F33 F3 <<<  "$Line3"

# Main loop: runs while every file still has a current line.
while [ $RC1 -eq 0 ] && [ $RC2 -eq 0 ] && [ $RC3 -eq 0 ]
do
  # Advance file 2 until its key is no longer below file 1's key.
  while [ "$F23" \< "$F13" ]
  do
    IFS= read -r -u 20 Line2
    if [ $? -eq 0 ] ; then RC2=0; else RC2=1; break; fi
    read -r F21 F22 F23 F2 <<<  "$Line2"
  done
  if [ $RC2 -ne 0 ]; then break; fi
  if [ "$F23" = "$F13" ]
  then
    # Files 1 and 2 agree on a key; bring file 3 up to it.
    FFF="$F23"
    while [ "$F33" \< "$FFF" ]
    do
      IFS= read -r -u 30 Line3
      if [ $? -eq 0 ] ; then RC3=0; else RC3=1; break; fi
      read -r F31 F32 F33 F3 <<<  "$Line3"
    done
    if [ $RC3 -ne 0 ]; then break; fi
    if [ "$F33" = "$FFF" ]
    then
      # Key present in all three files: copy every line carrying it from each
      # file to its output. Hitting end of file only leaves the inner loop, so
      # the remaining files still flush their lines for this key.
      while [ "$F23" = "$FFF" ]
      do
        printf "%s\n" "$Line2" >&21
        IFS= read -r -u 20 Line2
        if [ $? -eq 0 ] ; then RC2=0; else RC2=1; break; fi
        read -r F21 F22 F23 F2 <<<  "$Line2"
      done
      while [ "$F13" = "$FFF" ]
      do
        printf "%s\n" "$Line1" >&11
        IFS= read -r -u 10 Line1
        if [ $? -eq 0 ] ; then RC1=0; else RC1=1; break; fi
        read -r F11 F12 F13 F1 <<<  "$Line1"
      done
      while [ "$F33" = "$FFF" ]
      do
        printf "%s\n" "$Line3" >&31
        IFS= read -r -u 30 Line3
        if [ $? -eq 0 ] ; then RC3=0; else RC3=1; break; fi
        read -r F31 F32 F33 F3 <<<  "$Line3"
      done
      # Once any file is exhausted no further common key can exist.
      if [ $RC1 -ne 0 ] || [ $RC2 -ne 0 ] || [ $RC3 -ne 0 ]; then break; fi
    fi
  fi
  # Re-align: file 2 catches up with file 3, then file 1 catches up with file 2.
  while [ "$F33" \> "$F23" ]
  do
    IFS= read -r -u 20 Line2
    if [ $? -eq 0 ] ; then RC2=0; else RC2=1; break; fi
    read -r F21 F22 F23 F2 <<<  "$Line2"
  done
  while [ "$F23" \> "$F13" ]
  do
    IFS= read -r -u 10 Line1
    if [ $? -eq 0 ] ; then RC1=0; else RC1=1; break; fi
    read -r F11 F12 F13 F1 <<<  "$Line1"
  done
done
# close files (all six descriptors, including those of the third file)
exec 10>&-
exec 11>&-
exec 20>&-
exec 21>&-
exec 30>&-
exec 31>&-

该脚本仅读取文件一次，并交替前进。因此，所有输入文件必须同时打开。这在 bash 中是可能的，但 shell 并不是进行此类处理的好工具。对于大型输入文件，通过用 C 语言编写，您可能会获得更好的结果。也可以更轻松地为代码提供更好的结构，并完全控制过程。

Answer

这确实是一个很好的问题。如果您确定文件按第三列升序排序，我建议使用与“SQL 世界”中的合并连接类似的方法。文件的排序必须使用相同的排序函数来完成，该函数用于比较 bash 中的字符串。下面的脚本需要 bash。应该通过以下方式调用它：

$ script file1 file2 file3

它将创建 3 个输出文件：file1.out、file2.out 和 file3.out。每个输出文件包含对应源文件中的这样一些行：其第三列的值在所有输入文件中都出现过。凡是具有此类值的行，都会被写入相应的“.out”文件。

#!/bin/bash
# Merge-join style filter for three files sorted ascending on their third
# whitespace-separated column.
#
# Usage: script file1 file2 file3
#
# Creates file1.out, file2.out and file3.out. Each output starts with the first
# line of its input (copied unchanged) followed by the input lines whose third
# column value occurs in all three inputs.
#
# Every input is read exactly once, all three in parallel. Ordering decisions
# use the string operators \< and \> of the [ builtin, so the inputs must be
# sorted consistently with that comparison.
#
# Note: a final line without a trailing newline makes read return non-zero and
# is therefore treated as end of file.

# Refuse to run without the three file names; otherwise the redirections below
# would operate on empty names.
if [ $# -lt 3 ]; then
  printf 'Usage: %s file1 file2 file3\n' "$0" >&2
  exit 2
fi

Fil1="$1"
Fil2="$2"
Fil3="$3"
# Opening file descriptors for reading
exec 10<"$Fil1"
exec 20<"$Fil2"
exec 30<"$Fil3"
# Open output files
exec 11>"$Fil1".out
exec 21>"$Fil2".out
exec 31>"$Fil3".out
# First line is copied without changes.
IFS= read -r -u 10 Line1
printf "%s\n" "$Line1" >&11
IFS= read -r -u 20 Line2
printf "%s\n" "$Line2" >&21
IFS= read -r -u 30 Line3
printf "%s\n" "$Line3" >&31
# Prepare to walk through files, searching for identical values in the third column.
# For file n: LineN is the current line, RCn is 0 while a current line exists,
# Fn3 is its third column and Fn receives everything after the third column.
IFS= read -r -u 10 Line1
if [ $? -eq 0 ] ; then RC1=0; else RC1=1; fi
read -r F11 F12 F13 F1 <<<  "$Line1"

IFS= read -r -u 20 Line2
if [ $? -eq 0 ] ; then RC2=0; else RC2=1; fi
read -r F21 F22 F23 F2 <<<  "$Line2"

IFS= read -r -u 30 Line3
if [ $? -eq 0 ] ; then RC3=0; else RC3=1; fi
read -r F31 F32 F33 F3 <<<  "$Line3"

# Main loop: runs while every file still has a current line.
while [ $RC1 -eq 0 ] && [ $RC2 -eq 0 ] && [ $RC3 -eq 0 ]
do
  # Advance file 2 until its key is no longer below file 1's key.
  while [ "$F23" \< "$F13" ]
  do
    IFS= read -r -u 20 Line2
    if [ $? -eq 0 ] ; then RC2=0; else RC2=1; break; fi
    read -r F21 F22 F23 F2 <<<  "$Line2"
  done
  if [ $RC2 -ne 0 ]; then break; fi
  if [ "$F23" = "$F13" ]
  then
    # Files 1 and 2 agree on a key; bring file 3 up to it.
    FFF="$F23"
    while [ "$F33" \< "$FFF" ]
    do
      IFS= read -r -u 30 Line3
      if [ $? -eq 0 ] ; then RC3=0; else RC3=1; break; fi
      read -r F31 F32 F33 F3 <<<  "$Line3"
    done
    if [ $RC3 -ne 0 ]; then break; fi
    if [ "$F33" = "$FFF" ]
    then
      # Key present in all three files: copy every line carrying it from each
      # file to its output. Hitting end of file only leaves the inner loop, so
      # the remaining files still flush their lines for this key.
      while [ "$F23" = "$FFF" ]
      do
        printf "%s\n" "$Line2" >&21
        IFS= read -r -u 20 Line2
        if [ $? -eq 0 ] ; then RC2=0; else RC2=1; break; fi
        read -r F21 F22 F23 F2 <<<  "$Line2"
      done
      while [ "$F13" = "$FFF" ]
      do
        printf "%s\n" "$Line1" >&11
        IFS= read -r -u 10 Line1
        if [ $? -eq 0 ] ; then RC1=0; else RC1=1; break; fi
        read -r F11 F12 F13 F1 <<<  "$Line1"
      done
      while [ "$F33" = "$FFF" ]
      do
        printf "%s\n" "$Line3" >&31
        IFS= read -r -u 30 Line3
        if [ $? -eq 0 ] ; then RC3=0; else RC3=1; break; fi
        read -r F31 F32 F33 F3 <<<  "$Line3"
      done
      # Once any file is exhausted no further common key can exist.
      if [ $RC1 -ne 0 ] || [ $RC2 -ne 0 ] || [ $RC3 -ne 0 ]; then break; fi
    fi
  fi
  # Re-align: file 2 catches up with file 3, then file 1 catches up with file 2.
  while [ "$F33" \> "$F23" ]
  do
    IFS= read -r -u 20 Line2
    if [ $? -eq 0 ] ; then RC2=0; else RC2=1; break; fi
    read -r F21 F22 F23 F2 <<<  "$Line2"
  done
  while [ "$F23" \> "$F13" ]
  do
    IFS= read -r -u 10 Line1
    if [ $? -eq 0 ] ; then RC1=0; else RC1=1; break; fi
    read -r F11 F12 F13 F1 <<<  "$Line1"
  done
done
# close files (all six descriptors, including those of the third file)
exec 10>&-
exec 11>&-
exec 20>&-
exec 21>&-
exec 30>&-
exec 31>&-

该脚本仅读取文件一次，并交替前进。因此，所有输入文件必须同时打开。这在 bash 中是可能的，但 shell 并不是进行此类处理的好工具。对于大型输入文件，通过用 C 语言编写，您可能会获得更好的结果。也可以更轻松地为代码提供更好的结构，并完全控制过程。

有没有一种方法可以根据一列提取 3 个文件之间共有的行？

答案1

答案2

答案3

相关内容