Source: lib/cea/mp4_cea_parser.js

  1. /*! @license
  2. * Shaka Player
  3. * Copyright 2016 Google LLC
  4. * SPDX-License-Identifier: Apache-2.0
  5. */
  6. goog.provide('shaka.cea.Mp4CeaParser');
  7. goog.require('goog.asserts');
  8. goog.require('shaka.cea.CeaUtils');
  9. goog.require('shaka.cea.SeiProcessor');
  10. goog.require('shaka.log');
  11. goog.require('shaka.media.ClosedCaptionParser');
  12. goog.require('shaka.util.DataViewReader');
  13. goog.require('shaka.util.Error');
  14. goog.require('shaka.util.Mp4Parser');
  15. goog.require('shaka.util.Mp4BoxParsers');
  /**
   * MPEG4 stream parser used for extracting 708 closed captions data.
   *
   * Usage is two-phase.  init() must be given the init segment first; it
   * records the TREX sample defaults, the track id -> timescale mapping and
   * the video bitstream format.  parse() is then given each media segment and
   * returns the caption packets found in the SEI NAL units of its MDAT boxes.
   *
   * @implements {shaka.extern.ICeaParser}
   * @export
   */
  shaka.cea.Mp4CeaParser = class {
    /** */
    constructor() {
      /**
       * SEI data processor.
       * @private
       * @const {!shaka.cea.SeiProcessor}
       */
      this.seiProcessor_ = new shaka.cea.SeiProcessor();

      /**
       * Map of track id to corresponding timescale.
       * @private {!Map<number, number>}
       */
      this.trackIdToTimescale_ = new Map();

      /**
       * Default sample duration, as specified by the TREX box.
       * @private {number}
       */
      this.defaultSampleDuration_ = 0;

      /**
       * Default sample size, as specified by the TREX box.
       * @private {number}
       */
      this.defaultSampleSize_ = 0;

      /**
       * Bitstream format of the video samples, as detected by init() from
       * the sample entry box (or from the FRMA box for encrypted content).
       * While this is UNKNOWN, parse() returns no caption packets.
       * @private {shaka.cea.Mp4CeaParser.BitstreamFormat}
       */
      this.bitstreamFormat_ = shaka.cea.Mp4CeaParser.BitstreamFormat.UNKNOWN;
    }

    /**
     * Parses the init segment. Gets Default Sample Duration and Size from the
     * TREX box, and constructs a map of Track IDs to timescales. Each TRAK box
     * contains a track header (TKHD) containing track ID, and a media header
     * box (MDHD) containing the timescale for the track.
     *
     * The bitstream format is taken from whichever known codec sample entry
     * is found; for encrypted samples ('encv') it is taken from the FRMA box.
     * If no known codec is found, a warning is logged and the format is left
     * unchanged.
     *
     * Throws a CRITICAL shaka.util.Error with code INVALID_MP4_CEA if no
     * track is found, or if the number of track ids and timescales differ.
     *
     * @override
     */
    init(initSegment) {
      const Mp4Parser = shaka.util.Mp4Parser;
      const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;

      // Collected by the TKHD and MDHD handlers respectively, and paired up
      // by index once parsing is complete.
      const trackIds = [];
      const timescales = [];

      // For unencrypted content the name of the sample entry box is itself
      // the codec fourcc.
      const codecBoxParser = (box) => this.setBitstreamFormat_(box.name);

      new Mp4Parser()
          .box('moov', Mp4Parser.children)
          .box('mvex', Mp4Parser.children)
          .fullBox('trex', (box) => {
            const parsedTREXBox = shaka.util.Mp4BoxParsers.parseTREX(
                box.reader);

            this.defaultSampleDuration_ = parsedTREXBox.defaultSampleDuration;
            this.defaultSampleSize_ = parsedTREXBox.defaultSampleSize;
          })
          .box('trak', Mp4Parser.children)
          .fullBox('tkhd', (box) => {
            goog.asserts.assert(
                box.version != null,
                'TKHD is a full box and should have a valid version.');
            const parsedTKHDBox = shaka.util.Mp4BoxParsers.parseTKHD(
                box.reader, box.version);
            trackIds.push(parsedTKHDBox.trackId);
          })
          .box('mdia', Mp4Parser.children)
          .fullBox('mdhd', (box) => {
            goog.asserts.assert(
                box.version != null,
                'MDHD is a full box and should have a valid version.');
            const parsedMDHDBox = shaka.util.Mp4BoxParsers.parseMDHD(
                box.reader, box.version);
            timescales.push(parsedMDHDBox.timescale);
          })
          .box('minf', Mp4Parser.children)
          .box('stbl', Mp4Parser.children)
          .fullBox('stsd', Mp4Parser.sampleDescription)

          // These are the various boxes that signal a codec.
          .box('avc1', codecBoxParser)
          .box('avc3', codecBoxParser)
          .box('dvav', codecBoxParser)
          .box('dva1', codecBoxParser)
          .box('hev1', codecBoxParser)
          .box('hvc1', codecBoxParser)
          .box('dvh1', codecBoxParser)
          .box('dvhe', codecBoxParser)
          .box('vvc1', codecBoxParser)
          .box('vvi1', codecBoxParser)
          .box('dvc1', codecBoxParser)
          .box('dvi1', codecBoxParser)

          // This signals an encrypted sample, which we can go inside of to
          // find the codec used.
          .box('encv', Mp4Parser.visualSampleEntry)
          .box('sinf', Mp4Parser.children)
          .box('frma', (box) => {
            const {codec} = shaka.util.Mp4BoxParsers.parseFRMA(box.reader);
            this.setBitstreamFormat_(codec);
          })

          .parse(initSegment, /* partialOkay= */ true);

      // At least one track should exist, and each track should have a
      // corresponding Id in TKHD box, and timescale in its MDHD box
      if (!trackIds.length || !timescales.length ||
          trackIds.length != timescales.length) {
        throw new shaka.util.Error(
            shaka.util.Error.Severity.CRITICAL,
            shaka.util.Error.Category.TEXT,
            shaka.util.Error.Code.INVALID_MP4_CEA);
      }

      if (this.bitstreamFormat_ == BitstreamFormat.UNKNOWN) {
        shaka.log.alwaysWarn(
            'Unable to determine bitstream format for CEA parsing!');
      }

      // Populate the map from track Id to timescale
      trackIds.forEach((trackId, idx) => {
        this.trackIdToTimescale_.set(trackId, timescales[idx]);
      });
    }

    /**
     * Parses each video segment. In fragmented MP4s, MOOF and MDAT come in
     * pairs. The following logic gets the necessary info from MOOFs to parse
     * MDATs (base media decode time, sample sizes/offsets/durations, etc),
     * and then parses the MDAT boxes for CEA-708 packets using this
     * information.
     *
     * Returns the caption packets found, each paired with its presentation
     * time in seconds.  Returns an empty array if the bitstream format is
     * unknown.  Throws a CRITICAL shaka.util.Error with code INVALID_MP4_CEA
     * if a TRAF preceding an MDAT has no TFDT box.
     *
     * @override
     */
    parse(mediaSegment) {
      const Mp4Parser = shaka.util.Mp4Parser;
      const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;

      if (this.bitstreamFormat_ == BitstreamFormat.UNKNOWN) {
        // We don't know how to extract SEI from this.
        return [];
      }

      /** @type {!Array<!shaka.extern.ICeaParser.CaptionPacket>} **/
      const captionPackets = [];

      // Start of the most recent MOOF box.  Used to convert the TRUN data
      // offset into a position inside the MDAT payload.
      let moofOffset = 0;

      /** @type {!Array<!shaka.cea.Mp4CeaParser.ParsedTRAF>} */
      let parsedTRAFs = [];

      new Mp4Parser()
          .box('moof', (box) => {
            moofOffset = box.start;
            // traf box parsing is reset on each moof.
            parsedTRAFs = [];
            Mp4Parser.children(box);
          })
          .box('traf', (box) => {
            // Start with the TREX defaults; TFHD / TFDT / TRUN handlers
            // below fill in the rest for this (the last) entry.
            parsedTRAFs.push({
              baseMediaDecodeTime: null,
              defaultSampleDuration: this.defaultSampleDuration_,
              defaultSampleSize: this.defaultSampleSize_,
              parsedTRUNs: [],
              timescale: shaka.cea.CeaUtils.DEFAULT_TIMESCALE_VALUE,
            });
            Mp4Parser.children(box);
          })
          .fullBox('trun', (box) => {
            goog.asserts.assert(
                box.version != null && box.flags != null,
                'TRUN is a full box and should have a valid version & flags.');

            const lastTRAF = parsedTRAFs[parsedTRAFs.length - 1];
            if (!lastTRAF) {
              // A TRUN that is not inside a TRAF is malformed.  Ignore it
              // instead of failing with a TypeError.
              return;
            }

            const parsedTRUN = shaka.util.Mp4BoxParsers.parseTRUN(
                box.reader, box.version, box.flags);

            lastTRAF.parsedTRUNs.push(parsedTRUN);
          })
          .fullBox('tfhd', (box) => {
            goog.asserts.assert(
                box.flags != null,
                'TFHD is a full box and should have valid flags.');

            const lastTRAF = parsedTRAFs[parsedTRAFs.length - 1];
            if (!lastTRAF) {
              // A TFHD that is not inside a TRAF is malformed.  Ignore it
              // instead of failing with a TypeError.
              return;
            }

            const parsedTFHD = shaka.util.Mp4BoxParsers.parseTFHD(
                box.reader, box.flags);

            // If specified, defaultSampleDuration and defaultSampleSize
            // override the ones specified in the TREX box
            lastTRAF.defaultSampleDuration =
                parsedTFHD.defaultSampleDuration ||
                this.defaultSampleDuration_;
            lastTRAF.defaultSampleSize = parsedTFHD.defaultSampleSize ||
                this.defaultSampleSize_;

            const trackId = parsedTFHD.trackId;

            // Get the timescale from the track Id.  Tracks that init() did
            // not see keep the default timescale.
            if (this.trackIdToTimescale_.has(trackId)) {
              lastTRAF.timescale = this.trackIdToTimescale_.get(trackId);
            }
          })
          .fullBox('tfdt', (box) => {
            goog.asserts.assert(
                box.version != null,
                'TFDT is a full box and should have a valid version.');

            const lastTRAF = parsedTRAFs[parsedTRAFs.length - 1];
            if (!lastTRAF) {
              // A TFDT that is not inside a TRAF is malformed.  Ignore it
              // instead of failing with a TypeError.
              return;
            }

            const parsedTFDT = shaka.util.Mp4BoxParsers.parseTFDTInaccurate(
                box.reader, box.version);

            lastTRAF.baseMediaDecodeTime = parsedTFDT.baseMediaDecodeTime;
          })
          .box('mdat', (box) => {
            // TRUN data offsets are added to the MOOF start, while the reader
            // is positioned after the MDAT header.  This is the (negative)
            // distance from that position back to the MOOF start.
            // NOTE(review): the constant 8 assumes a compact MDAT header
            // (32-bit size field) -- confirm for 64-bit sized MDAT boxes.
            const offset = moofOffset - box.start - 8;
            const initialPosition = box.reader.getPosition();
            for (const parsedTRAF of parsedTRAFs) {
              if (parsedTRAF.baseMediaDecodeTime === null) {
                // This field should have been populated by the Base Media
                // Decode Time in the tfdt box.
                shaka.log.alwaysWarn(
                    'Unable to find base media decode time for CEA captions!');
                throw new shaka.util.Error(
                    shaka.util.Error.Severity.CRITICAL,
                    shaka.util.Error.Category.TEXT,
                    shaka.util.Error.Code.INVALID_MP4_CEA);
              }
              // Every TRAF is read from the start of the same MDAT payload.
              box.reader.seek(initialPosition);
              this.parseMdat_(box.reader,
                  parsedTRAF.baseMediaDecodeTime,
                  parsedTRAF.timescale,
                  parsedTRAF.defaultSampleDuration,
                  parsedTRAF.defaultSampleSize,
                  offset,
                  parsedTRAF.parsedTRUNs,
                  captionPackets);
            }
          })
          .parse(mediaSegment, /* partialOkay= */ false);

      return captionPackets;
    }

    /**
     * Parse MDAT box.
     *
     * Walks the length-prefixed NAL units of one track fragment, hands every
     * SEI NAL unit to the SEI processor, and appends the resulting packets
     * (with their presentation time in seconds) to captionPackets.  Reading
     * stops quietly when a NAL unit runs past the end of the data.
     *
     * @param {!shaka.util.DataViewReader} reader Positioned at the start of
     *   the MDAT payload.
     * @param {number} time Base media decode time of the first sample, in
     *   timescale units.
     * @param {number} timescale
     * @param {number} defaultSampleDuration
     * @param {number} defaultSampleSize
     * @param {number} offset Added to the first TRUN's data offset to get
     *   the number of bytes to skip from the reader's current position.
     * @param {!Array<shaka.util.ParsedTRUNBox>} parsedTRUNs
     * @param {!Array<!shaka.extern.ICeaParser.CaptionPacket>} captionPackets
     *   Output array; found packets are appended to it.
     * @private
     */
    parseMdat_(reader, time, timescale, defaultSampleDuration,
        defaultSampleSize, offset, parsedTRUNs, captionPackets) {
      const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;
      const CeaUtils = shaka.cea.CeaUtils;

      if (!parsedTRUNs.length) {
        // A TRAF without any TRUN gives no data offset, so there is nothing
        // to locate in this MDAT for it.
        return;
      }

      let sampleIndex = 0;

      // The fields in each ParsedTRUNSample contained in the sampleData
      // array are nullable. In the case of sample data and sample duration,
      // we use the defaults provided by the TREX/TFHD boxes. For sample
      // composition time offset, we default to 0.
      let sampleSize = defaultSampleSize;

      // Combine all sample data.  This assumes that the samples described
      // across multiple trun boxes are still continuous in the mdat box.
      const sampleDatas = parsedTRUNs.map((t) => t.sampleData);
      const sampleData = [].concat(...sampleDatas);

      if (sampleData.length) {
        sampleSize = sampleData[0].sampleSize || defaultSampleSize;
      }

      reader.skip(offset + parsedTRUNs[0].dataOffset);

      while (reader.hasMoreData()) {
        // Each NAL unit is preceded by a 4-byte length.
        const naluSize = reader.readUint32();
        const naluHeader = reader.readUint8();
        let naluType = null;
        let isSeiMessage = false;
        let naluHeaderSize = 1;

        goog.asserts.assert(this.bitstreamFormat_ != BitstreamFormat.UNKNOWN,
            'Bitstream format should have been checked before now!');
        switch (this.bitstreamFormat_) {
          case BitstreamFormat.H264:
            naluType = naluHeader & 0x1f;
            isSeiMessage = naluType == CeaUtils.H264_NALU_TYPE_SEI;
            break;

          case BitstreamFormat.H265:
            naluHeaderSize = 2;
            reader.skip(1);
            naluType = (naluHeader >> 1) & 0x3f;
            isSeiMessage =
                naluType == CeaUtils.H265_PREFIX_NALU_TYPE_SEI ||
                naluType == CeaUtils.H265_SUFFIX_NALU_TYPE_SEI;
            break;

          case BitstreamFormat.H266: {
            naluHeaderSize = 2;
            // Unlike H265, the first header byte of a VVC NAL unit carries
            // the layer id.  nal_unit_type is the top 5 bits of the second
            // byte (ITU-T H.266, NAL unit header syntax).
            // NOTE(review): assumes the CeaUtils H266 constants hold the
            // nal_unit_type values from the spec -- confirm.
            const secondHeaderByte = reader.readUint8();
            naluType = (secondHeaderByte >> 3) & 0x1f;
            isSeiMessage =
                naluType == CeaUtils.H266_PREFIX_NALU_TYPE_SEI ||
                naluType == CeaUtils.H266_SUFFIX_NALU_TYPE_SEI;
            break;
          }

          default:
            return;
        }

        const payloadSize = naluSize - naluHeaderSize;

        if (isSeiMessage) {
          let timeOffset = 0;

          if (sampleIndex < sampleData.length) {
            timeOffset =
                sampleData[sampleIndex].sampleCompositionTimeOffset || 0;
          }

          const pts = (time + timeOffset) / timescale;

          let seiPayload = null;
          try {
            seiPayload = reader.readBytes(payloadSize);
          } catch (e) {
            // Same policy as for the other NAL units below: a truncated
            // NAL unit must not break playback.  Handled right after.
          }
          if (seiPayload === null) {
            break;
          }

          for (const packet of this.seiProcessor_.process(seiPayload)) {
            captionPackets.push({
              packet,
              pts,
            });
          }
        } else {
          try {
            reader.skip(payloadSize);
          } catch (e) {
            // It is necessary to ignore this error because it can break the
            // start of playback even if the user does not want to see the
            // subtitles.
            break;
          }
        }

        // Account for the NAL unit and its 4-byte length prefix.  Once the
        // current sample is fully consumed, advance the clock by its
        // duration and move on to the next sample.
        sampleSize -= (naluSize + 4);
        if (sampleSize == 0) {
          if (sampleIndex < sampleData.length) {
            time += sampleData[sampleIndex].sampleDuration ||
                defaultSampleDuration;
          } else {
            time += defaultSampleDuration;
          }

          sampleIndex++;

          if (sampleIndex < sampleData.length) {
            sampleSize =
                sampleData[sampleIndex].sampleSize || defaultSampleSize;
          } else {
            sampleSize = defaultSampleSize;
          }
        }
      }
    }

    /**
     * Sets the bitstream format from a codec fourcc.  Codecs that are not
     * listed in CodecBitstreamMap_ leave the current format unchanged.
     *
     * @param {string} codec A fourcc for a codec.
     * @private
     */
    setBitstreamFormat_(codec) {
      if (codec in shaka.cea.Mp4CeaParser.CodecBitstreamMap_) {
        this.bitstreamFormat_ =
            shaka.cea.Mp4CeaParser.CodecBitstreamMap_[codec];
      }
    }
  };
  /**
   * Video bitstream formats this parser distinguishes when locating SEI
   * NAL units.
   *
   * @enum {number}
   */
  shaka.cea.Mp4CeaParser.BitstreamFormat = {
    // No known codec has been seen; nothing can be extracted.
    UNKNOWN: 0,
    H264: 1,
    H265: 2,
    H266: 3,
  };

  /**
   * Maps a codec fourcc (a sample entry box name, or the codec read from an
   * FRMA box) to the bitstream format used for its samples.
   *
   * @private {Object<string, shaka.cea.Mp4CeaParser.BitstreamFormat>}
   */
  shaka.cea.Mp4CeaParser.CodecBitstreamMap_ = {
    // AVC
    'avc1': shaka.cea.Mp4CeaParser.BitstreamFormat.H264,
    'avc3': shaka.cea.Mp4CeaParser.BitstreamFormat.H264,
    // Dolby Vision based on AVC
    'dvav': shaka.cea.Mp4CeaParser.BitstreamFormat.H264,
    'dva1': shaka.cea.Mp4CeaParser.BitstreamFormat.H264,
    // HEVC
    'hev1': shaka.cea.Mp4CeaParser.BitstreamFormat.H265,
    'hvc1': shaka.cea.Mp4CeaParser.BitstreamFormat.H265,
    // Dolby Vision based on HEVC
    'dvh1': shaka.cea.Mp4CeaParser.BitstreamFormat.H265,
    'dvhe': shaka.cea.Mp4CeaParser.BitstreamFormat.H265,
    // VVC
    'vvc1': shaka.cea.Mp4CeaParser.BitstreamFormat.H266,
    'vvi1': shaka.cea.Mp4CeaParser.BitstreamFormat.H266,
    // Dolby Vision based on VVC
    'dvc1': shaka.cea.Mp4CeaParser.BitstreamFormat.H266,
    'dvi1': shaka.cea.Mp4CeaParser.BitstreamFormat.H266,
  };
  /**
   * @typedef {{
   *   baseMediaDecodeTime: ?number,
   *   defaultSampleDuration: number,
   *   defaultSampleSize: number,
   *   parsedTRUNs: !Array<shaka.util.ParsedTRUNBox>,
   *   timescale: number
   * }}
   *
   * @description
   * What parse() collects for one TRAF box of a MOOF, and later uses to read
   * the matching samples out of the MDAT box.
   *
   * @property {?number} baseMediaDecodeTime
   *   Base media decode time from the TFDT box.  Null until a TFDT box has
   *   been parsed for this TRAF.
   * @property {number} defaultSampleDuration
   *   Default sample duration from the TFHD box if it specifies one,
   *   otherwise the one from the TREX box.
   * @property {number} defaultSampleSize
   *   Default sample size from the TFHD box if it specifies one, otherwise
   *   the one from the TREX box.
   * @property {!Array<shaka.util.ParsedTRUNBox>} parsedTRUNs
   *   The TRUN boxes of this TRAF, in the order they were parsed.
   * @property {number} timescale
   *   Timescale of the track named by the TFHD box if that track is known
   *   from the init segment, otherwise
   *   shaka.cea.CeaUtils.DEFAULT_TIMESCALE_VALUE.
   */
  shaka.cea.Mp4CeaParser.ParsedTRAF;
  // Register a factory so that a fresh Mp4CeaParser is created for
  // 'video/mp4' content.
  shaka.media.ClosedCaptionParser.registerParser(
      'video/mp4',
      () => {
        return new shaka.cea.Mp4CeaParser();
      });