编辑:于 2019-02-16 进行了大量更新,以包含更多故障排除信息。
我使用 iPXE 和 iSCSI 环境已有多年,但这是我第一次尝试 iSCSI 启动,而 iPXE 在与 iSCSI 目标通信时遇到了问题。
存储服务器
CentOS Linux release 7.6.1810 (Core)
Linux san1srvp01.********.net 3.10.0-957.5.1.el7.x86_64 #1 SMP Fri Feb 1 14:54:57 UTC 2019 x86_64 x86_64 x86_64 GNU/Linux
zfs-0.7.12-1.el7_6.x86_64
block 后备存储
实例
Disk /dev/zpool1/jane: 8422 MB, 8422687232 bytes, 16450561 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disk label type: dos
Disk identifier: 0xa9a554b4
Device Boot Start End Blocks Id System
/dev/zpool1/jane1 63 80324 40131 12 Compaq diagnostics
/dev/zpool1/jane2 * 80325 16434494 8177085 7 HPFS/NTFS/exFAT
fileio 后备存储
实例
Disk /zpool1/nas1/Media/c0d0.img: 8422 MB, 8422686720 bytes, 16450560 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disk label type: dos
Disk identifier: 0xa9a554b4
Device Boot Start End Blocks Id System
/zpool1/nas1/Media/c0d0.img1 63 80324 40131 12 Compaq diagnostics
/zpool1/nas1/Media/c0d0.img2 * 80325 16434494 8177085 7 HPFS/NTFS/exFAT
简化的 iSCSI 目标配置。这是使用 ZFS zvol 作为 block 后备存储的示例;我还尝试了 fileio,其行为没有什么不同。
{
"fabric_modules": [
{
"discovery_enable_auth": true,
"discovery_password": "********************************",
"discovery_userid": "san1srvp01",
"name": "iscsi"
}
],
"storage_objects": [
{
"alua_tpgs": [
{
"alua_access_state": 0,
"alua_access_status": 0,
"alua_access_type": 3,
"alua_support_active_nonoptimized": 1,
"alua_support_active_optimized": 1,
"alua_support_offline": 1,
"alua_support_standby": 1,
"alua_support_transitioning": 1,
"alua_support_unavailable": 1,
"alua_write_metadata": 0,
"implicit_trans_secs": 0,
"name": "default_tg_pt_gp",
"nonop_delay_msecs": 100,
"preferred": 0,
"tg_pt_gp_id": 0,
"trans_delay_msecs": 0
}
],
"attributes": {
"block_size": 512,
"emulate_3pc": 1,
"emulate_caw": 1,
"emulate_dpo": 1,
"emulate_fua_read": 1,
"emulate_fua_write": 1,
"emulate_model_alias": 1,
"emulate_rest_reord": 0,
"emulate_tas": 1,
"emulate_tpu": 0,
"emulate_tpws": 0,
"emulate_ua_intlck_ctrl": 0,
"emulate_write_cache": 0,
"enforce_pr_isids": 1,
"force_pr_aptpl": 0,
"is_nonrot": 1,
"max_unmap_block_desc_count": 1,
"max_unmap_lba_count": 262144,
"max_write_same_len": 65535,
"optimal_sectors": 32768,
"pi_prot_format": 0,
"pi_prot_type": 0,
"queue_depth": 128,
"unmap_granularity": 16,
"unmap_granularity_alignment": 0,
"unmap_zeroes_data": 0
},
"dev": "/dev/zpool1/jane",
"name": "jane",
"plugin": "block",
"readonly": false,
"write_back": false,
"wwn": "8688850f-7200-48a0-ad32-0f4f9397a836"
}
],
"targets": [
{
"fabric": "iscsi",
"tpgs": [
{
"attributes": {
"authentication": 0,
"cache_dynamic_acls": 0,
"default_cmdsn_depth": 64,
"default_erl": 0,
"demo_mode_discovery": 1,
"demo_mode_write_protect": 1,
"fabric_prot_type": 0,
"generate_node_acls": 0,
"login_timeout": 15,
"netif_timeout": 2,
"prod_mode_write_protect": 0,
"t10_pi": 0,
"tpg_enabled_sendtargets": 1
},
"enable": true,
"luns": [
{
"alias": "414d07d6b4",
"alua_tg_pt_gp_name": "default_tg_pt_gp",
"index": 2,
"storage_object": "/backstores/block/jane"
}
],
"node_acls": [
{
"attributes": {
"dataout_timeout": 3,
"dataout_timeout_retries": 5,
"default_erl": 0,
"nopin_response_timeout": 30,
"nopin_timeout": 15,
"random_datain_pdu_offsets": 0,
"random_datain_seq_offsets": 0,
"random_r2t_offsets": 0
},
"chap_mutual_password": "****************",
"chap_mutual_userid": "san1srvp01",
"chap_password": "****************",
"chap_userid": "jane",
"mapped_luns": [
{
"alias": "c8ce872be3",
"index": 2,
"tpg_lun": 2,
"write_protect": false
}
],
"node_wwn": "iqn.1999-10.net.********:jane"
}
],
"parameters": {
"AuthMethod": "CHAP,None",
"DataDigest": "CRC32C,None",
"DataPDUInOrder": "Yes",
"DataSequenceInOrder": "Yes",
"DefaultTime2Retain": "20",
"DefaultTime2Wait": "2",
"ErrorRecoveryLevel": "0",
"FirstBurstLength": "65536",
"HeaderDigest": "CRC32C,None",
"IFMarkInt": "2048~65535",
"IFMarker": "No",
"ImmediateData": "Yes",
"InitialR2T": "Yes",
"MaxBurstLength": "262144",
"MaxConnections": "1",
"MaxOutstandingR2T": "1",
"MaxRecvDataSegmentLength": "8192",
"MaxXmitDataSegmentLength": "262144",
"OFMarkInt": "2048~65535",
"OFMarker": "No",
"TargetAlias": "LIO Target"
},
"portals": [
{
"ip_address": "192.168.40.1",
"iser": false,
"offload": false,
"port": 3260
}
],
"tag": 1
}
],
"wwn": "iqn.1999-10.net.********:san1srvp01"
}
]
}
PXE/iPXE/TFTP/HTTP 服务器
CentOS release 6.10 (Final)
Linux sy1srvp01.********.net 2.6.32-754.10.1.el6.i686 #1 SMP Tue Jan 15 17:33:10 UTC 2019 i686 i686 i386 GNU/Linux
tftp-0.49-8.el6.i686
iPXE 实现会依次尝试加载与主机名、UUID 或 MAC 地址匹配的脚本。以下是针对此 MAC 地址的独立 iPXE 启动脚本 mac-0007e90feaf5.ipxe:
set username jane
set password ****************
set reverse-username san1srvp01
set reverse-password ****************
set initiator-iqn iqn.1999-10.net.********:jane
sanboot iscsi:192.168.40.1::::iqn.1999-10.net.********:san1srvp01
发起端(Initiator)
Compaq ML370 (Generation 0)
BIOS P17 (12/18/2002)
Processor 866/133 Mhz with 256k Cache
RAM 1 GB
Intel Boot Agent GE v1.2.22
PXE -> iPXE 链式加载
PXE 2.1 Build 084 (WfM 2.0), RPL V1.25
PX->EB: PXE! at 9CC2:0070, entry point at 9CC2:0106
UNDI code segment 9CC2:0000, data segment 969B:0000 (602-628kB)
UNDI device is PCI 00:06.0, type DIX+802.3
602kB free base memory after PXE unload
iPXE 1.0.0+ -- Open Source Network Boot Firmware -- http://ipxe.org
Features: DNS HTTP iSCSI TFTP AoE ELF MBOOT PXE bzImage Menu PXEXT
我使用网络抓包来跟踪 iPXE sanboot 的 iSCSI 启动过程。从高层次来看,流程是:
- 登录命令 (CHAP)
- 测试单元就绪
- 读取容量(10)
- 读取(10) <- 失败
首先,Read Capacity(10) 返回了意外的 LBA 值 63,而不是 16450560。然后它尝试在 LBA 64 处执行 Read(10),结果不出所料地失败,并返回 Logical Block Address Out Of Range。测试表明这是由 iPXE 和 LIO 之间的特定交互引起的,但确切原因尚不清楚。
读取容量(10) - 请求
Frame 27: 114 bytes on wire (912 bits), 114 bytes captured (912 bits)
Ethernet II, Src: Intel_0f:ea:f5 (00:07:e9:0f:ea:f5), Dst: SuperMic_6c:a9:82 (00:25:90:6c:a9:82)
Internet Protocol Version 4, Src: 192.168.4.13, Dst: 192.168.40.1
Transmission Control Protocol, Src Port: cifs (3020), Dst Port: iscsi-target (3260), Seq: 773, Ack: 637, Len: 48
iSCSI (SCSI Command)
Flags: 0xc1, F, R, Attr: Simple
SCSI CDB Read Capacity(10)
[LUN: 0x0000]
[Command Set:Direct Access Device (0x00) (Using default commandset)]
[Response in: 29]
Opcode: Read Capacity(10) (0x25)
Control: 0x00
读取容量(10) - 响应
Frame 29: 74 bytes on wire (592 bits), 74 bytes captured (592 bits)
Ethernet II, Src: SuperMic_6c:a9:82 (00:25:90:6c:a9:82), Dst: Intel_0f:ea:f5 (00:07:e9:0f:ea:f5)
Internet Protocol Version 4, Src: 192.168.40.1, Dst: 192.168.4.13
Transmission Control Protocol, Src Port: iscsi-target (3260), Dst Port: cifs (3020), Seq: 685, Ack: 821, Len: 8
[2 Reassembled TCP Segments (56 bytes): #28(48), #29(8)]
iSCSI (SCSI Data In)
SCSI Payload (Read Capacity(10) Response Data)
[LUN: 0x0000]
[Command Set:Direct Access Device (0x00) (Using default commandset)]
[SBC Opcode: Read Capacity(10) (0x25)]
[Request in: 27]
[Response in: 29]
LBA: 63 (0 MB)
Block size in bytes: 512
SCSI Response (Read Capacity(10))
[LUN: 0x0000]
[Command Set:Direct Access Device (0x00) (Using default commandset)]
[SBC Opcode: Read Capacity(10) (0x25)]
[Request in: 27]
[Time from request: 0.000252000 seconds]
[Status: Good (0x00)]
读取(10) - 请求
Frame 32: 114 bytes on wire (912 bits), 114 bytes captured (912 bits)
Ethernet II, Src: Intel_0f:ea:f5 (00:07:e9:0f:ea:f5), Dst: SuperMic_6c:a9:82 (00:25:90:6c:a9:82)
Internet Protocol Version 4, Src: 192.168.4.13, Dst: 192.168.40.1
Transmission Control Protocol, Src Port: cifs (3020), Dst Port: iscsi-target (3260), Seq: 821, Ack: 693, Len: 48
iSCSI (SCSI Command)
Flags: 0xc1, F, R, Attr: Simple
SCSI CDB Read(10)
[LUN: 0x0000]
[Command Set:Direct Access Device (0x00) (Using default commandset)]
[Response in: 33]
Opcode: Read(10) (0x28)
Flags: 0x00
Logical Block Address (LBA): 64
...0 0000 = Group: 0x00
Transfer Length: 4
Control: 0x00
读取(10) - 响应
Frame 33: 214 bytes on wire (1712 bits), 214 bytes captured (1712 bits)
Ethernet II, Src: SuperMic_6c:a9:82 (00:25:90:6c:a9:82), Dst: Intel_0f:ea:f5 (00:07:e9:0f:ea:f5)
Internet Protocol Version 4, Src: 192.168.40.1, Dst: 192.168.4.13
Transmission Control Protocol, Src Port: iscsi-target (3260), Dst Port: cifs (3020), Seq: 693, Ack: 869, Len: 148
iSCSI (SCSI Response)
Flags: 0x80
SCSI: SNS Info
[LUN: 0x0000]
.111 0000 = SNS Error Type: Current Error (0x70)
Valid: 112
0... .... = Filemark: False
.0.. .... = EOM: False
..0. .... = ILI: False
.... 0101 = Sense Key: Illegal Request (0x5)
Sense Info: 0x00000000
Additional Sense Length: 10
Command-Specific Information: 00000000
Additional Sense Code+Qualifier: Logical Block Address Out Of Range (0x2100)
Field Replaceable Unit Code: 0x00
0... .... = SKSV: False
.000 0000 0000 0000 0000 0000 = Sense Key Specific: 0x000000
iPXE 对控制台的响应
Could not open SAN device: Input/output error (http://ipxe.org/1d704039)
Could not boot image: Input/output error (http://ipxe.org/1d704039)
在 iSCSI 目标上记录的消息
Feb 13 09:17:41 san1srvp01 kernel: cmd exceeds last lba 64 (lba 64, sectors 4)
测试和故障排除
- 尝试使用此 iSCSI LUN 启动虚拟机时观察到了相同的行为,从而排除了物理机及其网卡的问题。
- 使用原生 Linux 发起端挂载时,iSCSI 设备行为正常;最初正是通过 iSCSI 挂载该设备后,使用 dd 将映像文件复制到该设备上的。
- 我创建了一个自定义的 iPXE 版本,强制使用 Read Capacity(16) 和 Read(16),但无济于事。
- 我发现了一个有记录的类似行为实例,其原因被确定为 Login 阶段提供(或未提供)的操作参数。作为回应,我创建了一个自定义版本,其参数和参数值与使用原生 Linux 发起端成功挂载时观察到的参数和参数值相同,但无济于事。
- 我尝试将 fileio 后备存储映像从 ZFS 移动到 xfs 卷,但无济于事。
- 我尝试过在块设备内容全部清零的情况下执行 iPXE sanboot,但无济于事。这表明问题与分区或块设备内容无关。
问题
- 有人能确认存在与此类似且可以正常工作的配置吗?
- 有人知道这个设置到底存在什么问题吗?
- 有谁知道是什么原因导致 LIO 出现这种行为?
- 最难的问题是,有人知道如何解决这个问题吗?
— 提前致谢(TIA)
答案1
该卷是存储服务器上的 LUN 2,因此 iPXE 需要在 SAN URI 中显式指定 LUN 2,例如:sanboot iscsi:192.168.40.1:::2:iqn.1999-10.net.********:san1srvp01。存储服务器甚至没有 LUN 0,因此不清楚为什么 Test Unit Ready 会成功。
有点尴尬,因为这是件如此简单的事情。