计算xpehh选择信号

xpehh (Cross Population Extended Haplotype Homozygosity) 是一种用于选择信号检测的统计方法,用于寻找遗传变异与自然选择之间的关联。它基于单核苷酸多态性(SNP)在不同群体之间的扩展单倍型同质性差异。通过计算目标群体与参考群体之间的扩展单倍型同质性差异,并进行统计检验,xpehh 方法可以帮助检测到与自然选择相关的遗传变异。这种方法在遗传学和人类进化研究中得到了广泛应用。

xpehh 的选择信号原理如下:

  1. 扩展单倍型同质性:xpehh 方法使用单倍型信息来评估一个给定区域内的扩展单倍型同质性。这是指在某个区域内,多个连续的SNP共同遗传的单倍型。
  2. 不同群体间的比较:xpehh 方法将目标群体与参考群体进行比较。通常,目标群体是受选择的种群,而参考群体是未受选择的种群。通过比较两个群体之间的扩展单倍型同质性差异,可以检测到选择信号。
  3. 基于单倍型频率的计算:xpehh 方法基于单倍型频率来计算扩展单倍型同质性。它通过比较目标群体和参考群体中特定单倍型的频率差异来确定选择信号。如果某个单倍型在目标群体中频率较高,而在参考群体中频率较低,可能表明该区域存在选择。
  4. 统计检验:xpehh 方法使用统计检验来评估扩展单倍型同质性差异的显著性。常用的统计检验方法包括 Z 分数、标准化扩展单倍型同质性(XP-EHH)和正态近似。

计算

1.群体较少的情况

#分染色体填充数据
1
2
3
4
5
6
for i in {1..27};do

java -Xms500g -Xmx500g -jar /mnt/d/lx/soft/beagle/beagle.22Jul22.46e.jar nthreads=110 gt=/mnt/d/lx/data/chr/chr$i.recode.vcf out=phasechr$i

done

# XP-EHH脚本
#调用,给出文件路径
1
2
bash selscan.sh -v /mnt/d/lx/data/vcf323/phase323vcf.vcf.gz -r /mnt/d/lx/data/grouph/group19.txt -t /mnt/d/lx/data/grouph/other.txt -w 150000 -s 75000 -T 100 -chr 27 -o new

#编写sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/bin/bash
#create by lx
selscan="/home/liuxiao/soft/selscan/linux/selscan"
norm='/home/liuxiao/soft/selscan/linux/norm'
function usage() {
echo "Usage: bash $0 --vcf <vcf> --ref <ref> --tag <tag> --win <winsize> --thread <thread> --output <outprefix>"
echo "required options"
echo "-v|--vcf vcf file"
echo "-r|--ref ref sample per row per ID"
echo "-t|--tag tag sample per row per ID"
echo "-w|--win winsize in xpehh, default 50000"
echo "-s|--step stepsize in xpehh, defult 50000"
echo "-T|--thread threads, default 10"
echo "-c|--chr 最大染色体号,决定你的vcf文件分多少个染色体文件"
echo "-o|--output 输出文件前缀"
exit 1;
}

vcf=""
ref=""
tag=""
win="50000"
step="50000"
thread="50"
chr=""
output=""
while [[ $# -gt 0 ]];
do
case "$1" in
-v|--vcf )
vcf=$2 ; shift 2 ;;
-r|--ref )
ref=$2 ; shift 2 ;;
-t|--tag )
tag=$2 ; shift 2 ;;
-w|--win )
win=$2 ; shift 2 ;;
-s|--step )
step=$2 ; shift 2 ;;
-T|--thread )
thread=$2 ; shift 2 ;;
-c|--chr )
chr=$2 ; shift 2 ;;
-o|--output )
output=$2 ; shift 2 ;;
*) echo "输入参数不对哦!" >&2
usage
shift
esac
done
if [ -z $vcf ] || [ -z $ref ] || [ -z $tag ] || [ -z $output ]; then
echo "检查一下这几个参数输了没 --vcf --ref --tag --output !" >&2
usage
fi
function main() {
mkdir XP-EHH.progress
\# extract sample
$vcftools --gzvcf $vcf --keep $ref --recode --recode-INFO-all --out ./XP-EHH.progress/01.ref
$vcftools --gzvcf $vcf --keep $tag --recode --recode-INFO-all --out ./XP-EHH.progress/01.tag
cd XP-EHH.progress
for ((k=1; k<=$chr; k++));
do
\# splite chr for ref and tag
$vcftools --vcf 01.ref.recode.vcf --recode --recode-INFO-all --chr ${k} --out ref.chr${k}
$vcftools --vcf 01.tag.recode.vcf --recode --recode-INFO-all --chr ${k} --out tag.chr${k}
\# calculate map distance
$vcftools --vcf ref.chr${k}.recode.vcf --plink --out chr${k}.MT
awk 'BEGIN{OFS=" "} {print $1,".",$4,$4}' chr${k}.MT.map > chr${k}.MT.map.distance
done
for ((k=1; k<=$chr; k++));
do
\# XP-EHH
$selscan --xpehh --vcf tag.chr${k}.recode.vcf --vcf-ref ref.chr${k}.recode.vcf --map chr${k}.MT.map.distance --threads $thread --out chr${k}.ref_tag
\# norm
$norm --xpehh --files chr${k}.ref_tag.xpehh.out --bp-win --winsize $win
\# add win and step
python ../XPEHH_Win_step.py --file chr${k}.ref_tag.xpehh.out.norm -c $k --window $win --step $step
done
cat {1.."$chr"}.XPEHH > ../${output}.XPEHH
}
main
#XPEHH_Win_step.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import pandas as pd
import click

def load_data(file):
data = pd.read_csv(file, delimiter="\t|\s+",
engine='python')
return data

def results(data, step_size, window_size):
result = []
chromosome_length = max(data['pos'])

for BIN_START in range(1, chromosome_length, step_size):
BIN_END = BIN_START - 1 + window_size
if BIN_START + window_size > chromosome_length:
break

normxpehh_vals = []
for _, row in data[(data['pos'] >= BIN_START) & (data['pos'] < BIN_END)].iterrows():
if not pd.isna(row['pos']):
normxpehh_vals.append(row['normxpehh'])

# 计算 normxpehh 的平均值并保留4位小数, 统计区间SNP数量
avg_normxpehh = 0
nvar = 0
if len(normxpehh_vals) > 0:
avg_normxpehh = round(sum(normxpehh_vals) / len(normxpehh_vals), 4)
nvar = len(normxpehh_vals)
result.append([BIN_START, BIN_END,
avg_normxpehh, nvar])
return result

@click.command()
@click.option('-f','--file', help='xpehh.out.norm文件,norm后的位点文件,不是区间文件!!', required=True)
@click.option('-c','--chromosome', help='染色体号,因为是分染色体做的', required=True)
@click.option('-w','--window', help='窗口大小', type=int, default=50000)
@click.option('-s','--step', help='步长大小', type=int, default=50000)
def main(file, chromosome, window, step):
data = load_data(file)
window_size = window
step_size = step
out = results(data, step_size, window_size)

# 创建一个 DataFrame 对象来保存结果,并使用 to_csv 方法将其写入文件中
result_df = pd.DataFrame(out, columns=["BIN_START", "BIN_END",
"avg_normxpehh", "nvar"])
result_df.loc[:, 'CHROM'] = chromosome

if chromosome == 1:
result_df[["CHROM", "BIN_START", "BIN_END",
"nvar", "avg_normxpehh"]].to_csv(f'{chromosome}.XPEHH', sep='\t',
index=False)
else:
result_df[["CHROM", "BIN_START", "BIN_END",
"nvar", "avg_normxpehh"]].to_csv(f'{chromosome}.XPEHH', sep='\t',
index=False, header=False)

if __name__ == '__main__':
main()

2多个群体计算

#自用,给出文件夹路径
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/bin/bash
#create by lx
selscan="/home/liuxiao/soft/selscan/linux/selscan"
norm="/home/liuxiao/soft/selscan/linux/norm"
vcftools='/home/liuxiao/miniconda3/bin/vcftools'
function usage() {
echo "Usage: bash $0 --pi <vcf_folder> --win <winsize> --step <stepsize> --thread <thread> --output <outprefix>"
echo "required options"
echo "-pi|--pwdinput folder containing VCF files"
echo "-w|--win winsize in xpehh, default 50000"
echo "-s|--step stepsize in xpehh, default 50000"
echo "-t|--thread threads, default 10"
echo "-c|--chr "
echo "-po|--pwdout output file prefix"
exit 1;
}

vcf_folder=""
tag=""
win="150000"
step="75000"
thread="60"
chr=""
pwd=""

# bash *sh -pi /home/liuxiao/1254/result/vcf/ -w 150000 -s 75000 -c 27 -t 60 -po /home/liuxiao/1254/result/selscan/

while [[ $# -gt 0 ]]; do
case "$1" in
-pi|--pwdinput )
vcf_folder=$2 ; shift 2 ;;
-w|--win )
win=$2 ; shift 2 ;;
-s|--step )
step=$2 ; shift 2 ;;
-t|--thread )
thread=$2 ; shift 2 ;;
-c|--chr )
chr=$2 ; shift 2 ;;
-po|--pwdout )
pwd=$2 ; shift 2 ;;
*) echo "Invalid input parameter!" >&2
usage
shift
esac
done

if [ -z $vcf_folder ] || [ -z $pwd ]; then
echo "Please check if you have provided the required parameters: --pwdinput, and --pwdoutput!" >&2
usage
fi

function main() {
for vcf_file in $vcf_folder/*.vcf.gz;
do
vcf_filename=$(basename $vcf_file)
vcf_filename_no_ext="${vcf_filename%.vcf.gz}"
mkdir $pwd${vcf_filename_no_ext}
cd $pwd${vcf_filename_no_ext}

for ((k=1; k<=$chr; k++));
do
# splite chr for ref and tag
$vcftools --gzvcf $vcf_folder${vcf_filename_no_ext}.vcf.gz --recode --recode-INFO-all --chr ${k} --out ${vcf_filename_no_ext}.chr${k} &
done
wait
for ((k=1; k<=$chr; k++));
do
$selscan --xpehh --vcf /home/liuxiao/1254/result/selscan/hu/Hu.chr${k}.recode.vcf --vcf-ref ${vcf_filename_no_ext}.chr${k}.recode.vcf --map /home/liuxiao/1254/result/vcf/map/chr${k}.MT.map.distance --threads $thread --out chr${k}.${vcf_filename_no_ext}

$norm --xpehh --files chr${k}.${vcf_filename_no_ext}.xpehh.out --bp-win --winsize $win

python /home/liuxiao/1254/result/selscan/XPEHH_Win_step.py --file chr${k}.${vcf_filename_no_ext}.xpehh.out.norm -c $k --window $win --step $step
done
cat *.XPEHH > /home/liuxiao/1254/result/selscan/${vcf_filename_no_ext}.XPEHH
done
}

main
python脚本同上