Note: It is strongly recommended that you start engaging with the relevant open source community at the start of Stage 2. It takes time to find the right people in the community and engage with them.
Chris Tyler – https://wiki.cdot.senecacollege.ca/wiki/Fall_2019_SPO600_Project#Stage_2
Last Wednesday, November 27th, I joined FFmpeg’s freenode IRC network development channel and asked how I could help out. Here is a screenshot of the chat log:
So I went ahead and looked for predefined compiler macros using grep:
$ egrep -ir "^#if.*aarch64" /home/yzhu132/FFmpeg/
$ egrep -ir "^#if.*arm" /home/yzhu132/FFmpeg/
I also used grep to look for header and C files that reference the top five routines identified in my Stage 1.1:
$ egrep -ir "*worker_func*" --include=\*.h --include=\*.c /home/yzhu132/FFmpeg/
$ egrep -ir "*thread_worker*" --include=\*.h --include=\*.c /home/yzhu132/FFmpeg/
$ egrep -ir "*estimate_motion_thread*" --include=\*.h --include=\*.c /home/yzhu132/FFmpeg/
$ egrep -ir "*ff_estimate_p_frame_motion*" --include=\*.h --include=\*.c /home/yzhu132/FFmpeg/
$ egrep -ir "*encode_thread*" --include=\*.h --include=\*.c /home/yzhu132/FFmpeg/
But that didn’t get me very far since none of those files matched any of the files with predefined compiler macros.
However, I remembered that I could use perf report to investigate which lines of code are responsible for the high overhead.
- 74.36% 0.01% ffmpeg_g ffmpeg_g [.] worker_func
- 74.35% worker_func
- 39.44% estimate_motion_thread
- 39.33% ff_estimate_p_frame_motion
- 13.99% ff_epzs_motion_search
+ 11.93% pix_abs16_c
1.29% _mcount@@GLIBC_2.18
- 10.20% sad_hpel_motion_search
+ 4.92% pix_abs16_xy2_c
+ 1.92% pix_abs16_x2_c
1.88% pix_abs16_y2_c
1.22% _mcount@@GLIBC_2.18
+ 3.96% sse16_c
+ 3.19% pix_abs16_c
+ 3.09% pix_sum_c
+ 2.52% pix_norm1_c
1.16% _mcount@@GLIBC_2.18
- 34.11% encode_thread
+ 10.13% diff_pixels_c
+ 8.38% pix_abs8_c
+ 4.44% ff_dct_quantize_c
+ 2.46% ff_mpv_motion
+ 1.87% ff_mpeg4_encode_mb
+ 1.84% ff_mpv_reconstruct_mb
+ 1.15% get_pixels_8_c
+ 0.94% ff_mpv_reallocate_putbitbuffer
+ 0.82% ff_h263_update_motion_val
And from that, I have identified the three inner routines responsible for worker_func’s larger overhead:
- ff_epzs_motion_search
- sad_hpel_motion_search
- encode_thread
On closer inspection, I see pix_abs16_c and its variants called several times within the worker_func tree.
Here is pix_abs16_c’s assembly code (note: as captured, the annotated listing below is labelled _mcount@@GLIBC_2.18):
Percent│ Disassembly of section .text:
│
│ 00000000000d6560 <_mcount@@GLIBC_2.18>:
│ _mcount@@GLIBC_2.18():
0.52 │ adrp x4, __libc_multiple_libcs
│ add x1, x4, #0x278
0.26 │ stp x29, x30, [sp, #-16]!
│ mov x5, #0x1 // #1
0.00 │ mov x29, sp
71.17 │ 14: ldaxr x2, [x1]
│ ↓ cbnz x2, 24
2.46 │ stxr w3, x5, [x1]
0.00 │ ↑ cbnz w3, 14
15.23 │ 24: cmp x2, #0x0
│ ↓ b.ne dc
0.83 │ ldr x3, [x1, #64]
0.22 │ ldr x2, [x1, #80]
│ sub x0, x0, x3
0.00 │ cmp x0, x2
│ ↓ b.hi d8
0.17 │ ldr x2, [x1, #96]
0.17 │ ldr x7, [x1, #24]
│ lsr x0, x0, x2
│ sxtw x0, w0
0.07 │ ldr x2, [x1, #40]
1.56 │ ldr x3, [x7, x0, lsl #3]
│ ↓ cbz x3, e4
0.17 │ add x3, x3, x3, lsl #1
│ lsl x3, x3, #3
│ add x5, x2, x3
3.70 │ ldr x1, [x2, x3]
│ cmp x30, x1
0.00 │ ↓ b.ne 8c
0.22 │ ↓ b 150
0.05 │ 78: add x6, x2, x3, lsl #3
0.28 │ ldr x3, [x2, x3, lsl #3]
│ cmp x30, x3
│ ↓ b.eq 12c
0.02 │ mov x5, x6
0.09 │ 8c: ldr x1, [x5, #16]
│ add x3, x1, x1, lsl #1
│ ↑ cbnz x1, 78
│ add x3, x4, #0x278
│ ldr x1, [x2, #16]
│ ldr x3, [x3, #56]
│ add x1, x1, #0x1
│ str x1, [x2, #16]
│ cmp x1, x3
│ ↓ b.cs fc
│ add x3, x1, x1, lsl #1
│ mov x6, #0x1 // #1
│ lsl x3, x3, #3
│ add x5, x2, x3
│ str x30, [x2, x3]
│ str x6, [x5, #8]
│ ldr x2, [x7, x0, lsl #3]
│ str x2, [x5, #16]
│ str x1, [x7, x0, lsl #3]
0.00 │ d8: str xzr, [x4, #632]
1.77 │ dc: ldp x29, x30, [sp], #16
│ ← ret
│ e4: ldr x3, [x2, #16]
│ ldr x6, [x1, #56]
│ add x1, x3, x5
│ str x1, [x2, #16]
│ cmp x1, x6
│ ↓ b.cc 10c
│ fc: mov x0, #0x2 // #2
│ str x0, [x4, #632]
│ ldp x29, x30, [sp], #16
│ ← ret
0.00 │10c: add x3, x1, x1, lsl #1
│ str x1, [x7, x0, lsl #3]
│ lsl x0, x3, #3
│ add x1, x2, x0
│ str x30, [x2, x0]
│ str xzr, [x4, #632]
│ stp x5, xzr, [x1, #8]
│ ↑ b dc
0.08 │12c: ldp x2, x3, [x6, #8]
│ add x2, x2, #0x1
0.00 │ str x2, [x6, #8]
0.03 │ str x3, [x5, #16]
0.02 │ ldr x2, [x7, x0, lsl #3]
0.03 │ str x2, [x6, #16]
0.01 │ str x1, [x7, x0, lsl #3]
0.03 │ str xzr, [x4, #632]
│ ↑ b dc
0.68 │150: ldr x0, [x5, #8]
0.07 │ str xzr, [x4, #632]
│ add x0, x0, #0x1
0.09 │ str x0, [x5, #8]
│ ↑ b dc
So here are some observations from searching for the three inner routines and the functions they call in FFmpeg:
$ egrep -ir 'ff_epzs_motion_search|sad_hpel_motion_search|encode_thread|diff_pixels_c|pix_abs16' --include=\*.h --include=\*.c /home/yzhu132/FFmpeg/
/home/yzhu132/FFmpeg/libavcodec/alpha/me_cmp_alpha.c:int pix_abs16x16_mvi_asm(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h);
/home/yzhu132/FFmpeg/libavcodec/alpha/me_cmp_alpha.c:int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
/home/yzhu132/FFmpeg/libavcodec/alpha/me_cmp_alpha.c:static int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
/home/yzhu132/FFmpeg/libavcodec/alpha/me_cmp_alpha.c:static int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
/home/yzhu132/FFmpeg/libavcodec/alpha/me_cmp_alpha.c:static int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
/home/yzhu132/FFmpeg/libavcodec/alpha/me_cmp_alpha.c: c->sad[0] = pix_abs16x16_mvi_asm;
/home/yzhu132/FFmpeg/libavcodec/alpha/me_cmp_alpha.c: c->pix_abs[0][0] = pix_abs16x16_mvi_asm;
/home/yzhu132/FFmpeg/libavcodec/alpha/me_cmp_alpha.c: c->pix_abs[0][1] = pix_abs16x16_x2_mvi;
/home/yzhu132/FFmpeg/libavcodec/alpha/me_cmp_alpha.c: c->pix_abs[0][2] = pix_abs16x16_y2_mvi;
/home/yzhu132/FFmpeg/libavcodec/alpha/me_cmp_alpha.c: c->pix_abs[0][3] = pix_abs16x16_xy2_mvi;
/home/yzhu132/FFmpeg/libavcodec/arm/me_cmp_init_arm.c:int ff_pix_abs16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
/home/yzhu132/FFmpeg/libavcodec/arm/me_cmp_init_arm.c:int ff_pix_abs16_x2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
/home/yzhu132/FFmpeg/libavcodec/arm/me_cmp_init_arm.c:int ff_pix_abs16_y2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
/home/yzhu132/FFmpeg/libavcodec/arm/me_cmp_init_arm.c: c->pix_abs[0][0] = ff_pix_abs16_armv6;
/home/yzhu132/FFmpeg/libavcodec/arm/me_cmp_init_arm.c: c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
/home/yzhu132/FFmpeg/libavcodec/arm/me_cmp_init_arm.c: c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
/home/yzhu132/FFmpeg/libavcodec/arm/me_cmp_init_arm.c: c->sad[0] = ff_pix_abs16_armv6;
/home/yzhu132/FFmpeg/libavcodec/dnxhdenc.c:static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg,
/home/yzhu132/FFmpeg/libavcodec/dnxhdenc.c: avctx->execute2(avctx, dnxhd_encode_thread, buf, NULL, ctx->m.mb_height);
/home/yzhu132/FFmpeg/libavcodec/me_cmp.c:static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
/home/yzhu132/FFmpeg/libavcodec/me_cmp.c:static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
/home/yzhu132/FFmpeg/libavcodec/me_cmp.c:static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
/home/yzhu132/FFmpeg/libavcodec/me_cmp.c:static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
/home/yzhu132/FFmpeg/libavcodec/me_cmp.c: c->pix_abs[0][0] = pix_abs16_c;
/home/yzhu132/FFmpeg/libavcodec/me_cmp.c: c->pix_abs[0][1] = pix_abs16_x2_c;
/home/yzhu132/FFmpeg/libavcodec/me_cmp.c: c->pix_abs[0][2] = pix_abs16_y2_c;
/home/yzhu132/FFmpeg/libavcodec/me_cmp.c: c->pix_abs[0][3] = pix_abs16_xy2_c;
/home/yzhu132/FFmpeg/libavcodec/me_cmp.c: c->sad[0] = pix_abs16_c;
/home/yzhu132/FFmpeg/libavcodec/mips/me_cmp_init_mips.c: c->pix_abs[0][0] = ff_pix_abs16_msa;
/home/yzhu132/FFmpeg/libavcodec/mips/me_cmp_init_mips.c: c->pix_abs[0][1] = ff_pix_abs16_x2_msa;
/home/yzhu132/FFmpeg/libavcodec/mips/me_cmp_init_mips.c: c->pix_abs[0][2] = ff_pix_abs16_y2_msa;
/home/yzhu132/FFmpeg/libavcodec/mips/me_cmp_init_mips.c: c->pix_abs[0][3] = ff_pix_abs16_xy2_msa;
/home/yzhu132/FFmpeg/libavcodec/mips/me_cmp_init_mips.c: c->sad[0] = ff_pix_abs16_msa;
/home/yzhu132/FFmpeg/libavcodec/mips/me_cmp_mips.h:int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
/home/yzhu132/FFmpeg/libavcodec/mips/me_cmp_mips.h:int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
/home/yzhu132/FFmpeg/libavcodec/mips/me_cmp_mips.h:int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
/home/yzhu132/FFmpeg/libavcodec/mips/me_cmp_mips.h:int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
/home/yzhu132/FFmpeg/libavcodec/mips/me_cmp_msa.c:int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
/home/yzhu132/FFmpeg/libavcodec/mips/me_cmp_msa.c:int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
/home/yzhu132/FFmpeg/libavcodec/mips/me_cmp_msa.c:int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
/home/yzhu132/FFmpeg/libavcodec/mips/me_cmp_msa.c:int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
/home/yzhu132/FFmpeg/libavcodec/motion_est.c:static int sad_hpel_motion_search(MpegEncContext * s,
/home/yzhu132/FFmpeg/libavcodec/motion_est.c: c->sub_motion_search= sad_hpel_motion_search; // 2050 vs. 2450 cycles
/home/yzhu132/FFmpeg/libavcodec/motion_est.c:static int sad_hpel_motion_search(MpegEncContext * s,
/home/yzhu132/FFmpeg/libavcodec/motion_est.c: dmin = ff_epzs_motion_search(s, &mx, &my, P, 0, 0, s->p_mv_table, (1<<16)>>shift, 0, 16);
/home/yzhu132/FFmpeg/libavcodec/motion_est.c: dmin = ff_epzs_motion_search(s, &mx, &my, P, 0, 0, s->p_mv_table, (1<<16)>>shift, 0, 16);
/home/yzhu132/FFmpeg/libavcodec/motion_est.c: dmin = ff_epzs_motion_search(s, &mx, &my, P, 0, ref_index, s->p_mv_table, mv_scale, 0, 16);
/home/yzhu132/FFmpeg/libavcodec/motion_est.c: dmin = ff_epzs_motion_search(s, &mx, &my, P, 0, 0, mv_table, 1<<(16-shift), 0, 16);
/home/yzhu132/FFmpeg/libavcodec/motion_est.h:int ff_epzs_motion_search(struct MpegEncContext *s, int *mx_ptr, int *my_ptr,
/home/yzhu132/FFmpeg/libavcodec/motion_est_template.c:int ff_epzs_motion_search(MpegEncContext *s, int *mx_ptr, int *my_ptr,
/home/yzhu132/FFmpeg/libavcodec/mpegvideo_enc.c:static int encode_thread(AVCodecContext *c, void *arg){
/home/yzhu132/FFmpeg/libavcodec/mpegvideo_enc.c: s->avctx->execute(s->avctx, encode_thread, &s->thread_context[0], NULL, context_count, sizeof(void*));
/home/yzhu132/FFmpeg/libavcodec/pixblockdsp.c:static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
/home/yzhu132/FFmpeg/libavcodec/pixblockdsp.c: c->diff_pixels = diff_pixels_c;
/home/yzhu132/FFmpeg/libavcodec/snowenc.c: ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
- All results are confined within the libavcodec directory
- There are no results in the libavcodec/x86 directory
- The only results in the libavcodec/arm directory are pix_abs16 results
- I’ve found only one file that is common to both x86 and arm: it’s the me_cmp_init.c file
Comparing /arm/me_cmp_init_arm.c and /x86/me_cmp_init.c:
I’ll try making some code changes to permit better optimization by the ARM compiler.