Wednesday, February 7, 2024

[SOLVED] Merging multiple files with two common columns, and replace the blank to 0

Issue

I extremely appreciate if anyone could me help to merge multiple files (up to 8) with two common columns ($1$2). I want to get all values of $3 and replace the blank with 0. Here are the samples from 4 files

File1:

chr1 111001 234
chr2 22099  108

File2:

chr1 111001 42
chr1 430229 267

File3:

chr1 111001 92
chr5 663800 311

File4:

chr1 111001 129
chr2 22099  442

Desired output

chr1 111001 234 42 92 129
chr1 430229  0  267 0  0
chr2 22099  108 0  0  442
chr5 663800  0  0 311  0

I tried

awk '{ a[$1 OFS $2 FS] = a[$1 OFS $2 FS] ( a[$1 OFS $2 FS] == "" ? "" : OFS) $3 }END{ for (i in a){print i,"0",a[i]} }' OFS="\t"  file1.txt file2.txt file3.txt file4.txt | sort -k1

output

chr1    111001  0   234 42  92  129
chr1    430229  0   267
chr2    22099   0   108 442
chr5    663800  0   311

Thank very much in advance


Solution

One more variant, could you please try following, written and teste with shown samples.

awk '
{
  if(!a[FILENAME]++){
     file[++count]=FILENAME
  }
  b[$1 OFS $2 OFS FILENAME]=$NF
  c[$1 OFS $2]++
  if(!d[$1 OFS $2]++){
    e[++count1]=$1 OFS $2
  }
}
END{
  for(i=1;i<=length(c);i++){
    printf("%s ",e[i])
    for(j=1;j<=count;j++){
      printf("%s %s",(b[e[i] OFS file[j]]!=""?b[e[i] OFS file[j]]:0),j==count?ORS:OFS)
    }
  }
}
' file{1..4} | sort -k1

Output will be as follows.

chr1 111001 234  42  92  129
chr1 430229 0  267  0  0
chr2 22099 108  0  0  442
chr5 663800 0  0  311  0

Explanation: Adding detailed explanation for above.

awk '                                        ##Starting awk program from here.
{
  if(!a[FILENAME]++){                        ##Checking condition if FILENAME is present in a then do following.
     file[++count]=FILENAME                  ##Creating file with index of count and value is current file name.
  }
  b[$1 OFS $2 OFS FILENAME]=$NF              ##Creating array b with index of 1st 2nd and filename and which has value as last field.
  c[$1 OFS $2]++                             ##Creating array c with index of 1st and 2nd field and keep increasing its value with 1.
  if(!d[$1 OFS $2]++){                       ##Checking condition if 1st and 2nd field are NOT present in d then do following.
    e[++count1]=$1 OFS $2                    ##Creating e with index of count1 with increasing value of 1 and which has first and second fields here.
  }
}
END{                                         ##Starting END block of this awk program from here.
  for(i=1;i<=length(c);i++){                 ##Starting for loop which runs from i=1 to till length of c here.
    printf("%s ",e[i])                       ##Printing value of array e with index i here.
    for(j=1;j<=count;j++){                   ##Starting for loop till value of count here.
      printf("%s %s",(b[e[i] OFS file[j]]!=""?b[e[i] OFS file[j]]:0),j==count?ORS:OFS)   ##Printing value of b with index of e[i] OFS file[j] if it present then print else print 0, print new line if j==count or print space.
    }
  }
}
' file{1..4} | sort -k1                      ##Mentioning Input_files 1 to 4 here and sorting output with 1st field here.


EDIT: As per GREAT regex GURU @anubhava sir's comments adding solution with ARGC and ARGV with GNU awk.

awk '
{
  b[$1 OFS $2 OFS FILENAME]=$NF
  c[$1 OFS $2]++
  if(!d[$1 OFS $2]++){
    e[++count1]=$1 OFS $2
  }
}
END{
  count=(ARGC-1)
  for(i=1;i<=length(c);i++){
    printf("%s ",e[i])
    for(j=1;j<=(ARGC-1);j++){
      printf("%s %s",(b[e[i] OFS ARGV[j]]!=""?b[e[i] OFS ARGV[j]]:0),j==count?ORS:OFS)
    }
  }
}
' file{1..4} | sort -k1


Answered By - RavinderSingh13
Answer Checked By - Terry (WPSolving Volunteer)