123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475 |
- import argparse
- import pandas as pd
- def parse_attributes(attributes_str):
- """ 解析 GTF 文件中的属性字段 """
- attributes = {}
- for attr in attributes_str.split(';'):
- attr = attr.strip()
- if attr:
- parts = attr.split(' ', 1)
- if len(parts) == 2:
- key, value = parts
- attributes[key] = value.strip('"')
- return attributes
- def read_gtf(gtf_file):
- """ 从 GTF 文件读取数据 """
- data = []
- with open(gtf_file, 'r') as file:
- for line in file:
- if line.startswith('#'):
- continue
- fields = line.strip().split('\t')
- attributes = parse_attributes(fields[8])
- data.append({
- 'chrom': fields[0],
- 'source': fields[1],
- 'feature': fields[2],
- 'start': int(fields[3]),
- 'end': int(fields[4]),
- 'score': fields[5],
- 'strand': fields[6],
- 'frame': fields[7],
- 'attributes': fields[8],
- 'transcript_id': attributes.get('transcript_id', ''),
- 'gene_id': attributes.get('gene_id', ''),
- 'line': line.strip() # 保存整行
- })
- return pd.DataFrame(data)
- def longest_transcripts(df):
- """ 找到每个基因的最长转录本 """
- return df[df['feature'] == 'transcript'].sort_values(['gene_id', 'end'], ascending=[True, False]).drop_duplicates('gene_id')
- def to_bed6(df):
- """ 将数据转换为 BED6 格式 """
- df['score'] = "."
- df['name'] = df['gene_id']
- return df[['chrom', 'start', 'end', 'name', 'score', 'strand']]
- def write_filtered_gtf(df, longest_transcripts, gtf_out_file):
- """ 将最长转录本的相关行和对应基因的行写入新的 GTF 文件 """
- longest_transcript_ids = set(longest_transcripts['transcript_id'])
- with open(gtf_out_file, 'w') as f:
- for _, row in df.iterrows():
- if row['feature'] == 'gene' or row['transcript_id'] in longest_transcript_ids:
- f.write(row['line'] + '\n')
- def main():
- parser = argparse.ArgumentParser(description="从 GTF 文件提取最长转录本并保存为 BED6 和 GTF 格式")
- parser.add_argument("gtf_file", help="输入的 GTF 文件路径")
- parser.add_argument("bed_file", help="输出的 BED6 格式文件路径")
- parser.add_argument("gtf_out_file", help="输出的 GTF 格式文件路径")
- args = parser.parse_args()
- df = read_gtf(args.gtf_file)
- longest_df = longest_transcripts(df)
- bed6_df = to_bed6(longest_df)
- bed6_df.to_csv(args.bed_file, sep='\t', index=False, header=False)
- write_filtered_gtf(df, longest_df, args.gtf_out_file)
- if __name__ == "__main__":
- main()
|